python: ลองเขียนสคริปต์ลบสแปม

ลองเขียนสคริปต์ลบสแปม
ใช้กับบอร์ด yabbse กับ smf

ของ smf ยังไม่เสร็จ บันทึกเอาไว้เพื่อลองดูผลเท่านั้น
ต้องเปลี่ยนแปลงสคริปต์ตามธีมที่ใช้ด้วย

*** สคริปต์นี้ใช้กับ thailinuxhosting.com/yabbse เท่านั้น เพราะใส่โค้ดที่แก้ปัญหาบอร์ดไว้ด้วยครับ

#!/usr/bin/env python
# -*- coding: utf-8 -*-

# --- Configuration ---------------------------------------------------------
# Forum account: used for the login request and for the cookie override that
# the main loop applies.
user = "wd"
password = "mypassword"
enc_password = "XXXXXXXXXX"    # *** GET ENCRYPTED PASSWORD FROM BROWSER COOKIE
# Base URL of the forum; the commented-out alternative is the smf board.
site = "http://www.thailinuxhosting.com/yabbse" #"http://www.thaitux.info/smf"
board = "yabbse"     # "smf", "yabbse"
charset = "tis620"   # "utf8", "tis620"
# Number of entries of the board's recent-post list to scan (upper bound of
# the main loop counter).
max_loop = 5         # = RECENT LIST OF BOARD
# Working directory for the backup log, the spam keyword list and the cookie jar.
root = "/home/wd/spam"
backup_file = root+"/thailinuxhosting-bak.txt"
spamtext_file = root+"/spamlist.txt"
cookie_file = root+"/thailinuxhosting-cookie"

import sys
import os
import time

##### PRE RUN FOR RETRIEVE COOKIE #####
# Log in once and write the cookies the server sets to cookie_file, so the
# rest of the script can load them from disk.
import urllib2
import cookielib
# NOTE(review): user name and password are placed in the query string of a
# GET request against the plain-http `site` URL.
login = "/index.php?action=login2;user=%s;passwrd=%s;cookielength=302400" % (user, password,)
cj = cookielib.MozillaCookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
sock = opener.open(site+login)
# ignore_discard / ignore_expires: also write session and expired cookies.
cj.save(cookie_file, ignore_discard=True, ignore_expires=True)
sock.close()
#######################################

# Spam keywords, filled from spamtext_file further down and read by check_spam().
spamlist = []

def decoding(txt, encoding=None):
  """Convert a UTF-8 byte string to the board's character set.

  txt      -- UTF-8 encoded byte string.
  encoding -- target charset, "tis620" or "utf8". When omitted, the
              module-level `charset` setting is used.

  Returns the re-encoded byte string ("utf8" returns `txt` unchanged).
  Prints an error and exits with status 1 for any other charset.
  """
  if encoding is None:
    encoding = charset
  if encoding == "tis620":
    return txt.decode("utf8").encode("tis620")
  if encoding == "utf8":
    return txt
  print("Error, CHARSET is not defined")
  # The original `sys.exit[0]` subscripted the function (TypeError) instead of
  # calling it; exit with a non-zero status because this is an error path.
  sys.exit(1)

def search_line(txt, l, occur=1):
  """Return the index of the `occur`-th element of `l` that contains `txt`.

  txt   -- substring to look for.
  l     -- sequence of strings (lines).
  occur -- which match to return, counted from 1; values below 1 behave
           like 1.

  Returns -1 when there are fewer than `occur` matching elements.
  """
  remaining = occur
  for idx, row in enumerate(l):
    if txt not in row:
      continue
    if remaining <= 1:
      return idx
    # Not the requested occurrence yet: consume one match and keep going.
    remaining -= 1
  return -1

def get_msgid(url):
  """Return the text that follows the board's message marker in `url`.

  The marker is "#msg" for smf (...#msgXX) and ";start=" for yabbse
  (...;start=XX). If the marker is absent the whole url is returned; for any
  other `board` value the result is None.
  """
  markers = {"smf": "#msg", "yabbse": ";start="}
  marker = markers.get(board)
  if marker is None:
    return None
  return url.split(marker)[-1]

def check_spam(txt):
  """Check `txt` against the keywords in the module-level `spamlist`.

  Returns (True, keyword) for the first keyword that is a substring of
  `txt`, otherwise (False, '').
  """
  hit = next((kw for kw in spamlist if kw in txt), None)
  if hit is None:
    return False, ''
  return True, hit

def save_backup(txt):
  """Append `txt` followed by three newlines to `backup_file`.

  The file is opened in append mode, so earlier records are preserved.
  """
  # `with` closes the handle even when write() raises; the original
  # open/write/close sequence leaked it in that case.
  with open(backup_file,'a') as f:
    f.write(txt+'\n\n\n')

def die_board():
  """Report an unsupported `board` setting and terminate the script.

  Raises SystemExit with status 1.
  """
  print("board not exist")
  # The original `sys.exit[0]` subscripted the function, which raises
  # TypeError instead of exiting; call it, with a non-zero status.
  sys.exit(1)

# Choose the heading text that the main loop searches for on the front page
# to locate the recent-post list. The literals are UTF-8 bytes written as
# escapes; they are converted to the board charset with decoding() later.
if board == "smf":
  # Thai heading meaning "recent topics" (note the space before the last word).
  recent_str = "\xe0\xb8\x81\xe0\xb8\xa3\xe0\xb8\xb0\xe0\xb8\x97\xe0\xb8\xb9\
\xe0\xb9\x89\xe0\xb9\x80\xe0\xb8\xa1\xe0\xb8\xb7\xe0\xb9\x88\xe0\xb8\xad\
\xe0\xb9\x80\xe0\xb8\xa3\xe0\xb9\x87\xe0\xb8\xa7\xe0\xb9\x86 \xe0\xb8\x99\
\xe0\xb8\xb5\xe0\xb9\x89"
elif board == "yabbse":
  # Thai heading meaning "recent posts".
  recent_str = "\xe0\xb9\x82\xe0\xb8\x9e\xe0\xb8\xaa\xe0\xb8\x95\xe0\xb9\x8c\
\xe0\xb9\x80\xe0\xb8\xa1\xe0\xb8\xb7\xe0\xb9\x88\xe0\xb8\xad\xe0\xb9\x80\
\xe0\xb8\xa3\xe0\xb9\x87\xe0\xb8\xa7\xe0\xb9\x86\xe0\xb8\x99\xe0\xb8\xb5\xe0\xb9\x89"
else:
  # Any other board value is unsupported.
  die_board()

#LOAD SPAM DATA
# Create an empty keyword file on first run so the read below cannot fail.
if not os.path.exists(spamtext_file):
  open(spamtext_file,'w').close()

# One keyword per line (UTF-8); each is converted to the board charset.
with open(spamtext_file) as f:
  for i in f:
    keyword = i.strip()
    # Keep the original "raw line longer than 3 characters" filter, but also
    # require a non-empty keyword after stripping: a whitespace-only line
    # (e.g. "   \n") used to yield '', which is a substring of every post and
    # would flag everything as spam.
    if len(i) > 3 and keyword:
      spamlist.append(decoding(keyword))

# Heading text must be in the same charset as the fetched pages.
recent_str = decoding(recent_str)
    
    
#INIT COOKIE & OPENER
# Start from a fresh jar populated from cookie_file, and build the opener that
# sends those cookies with every request made by the main loop.
# NOTE(review): the file was saved with ignore_discard/ignore_expires=True but
# is loaded here with the defaults — confirm the cookies that matter survive.
cj = cookielib.MozillaCookieJar()
cj.load(cookie_file)
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    
loop_count = 0
url_list_pair = []    #SOLVE yabbse ONLY INDEX TO LAST MESSAGE, SO WE CREATE OUR OWN
while loop_count < max_loop:
  #FIRST PAGE
  sock = opener.open(site)
  #HACK: SOLVE yabbse'S BOARD COOKIE ERROR
  cj._cookies['thailinuxhosting.com']['/yabbse']['YaBBSE140usernamev14'].value = user
  cj._cookies['thailinuxhosting.com']['/yabbse']['YaBBSE140passwordv14'].value = enc_password
  html = sock.read()
  sock.close()
  
  l = html.split('\n')

  #SESSIONID
  if board == "smf":
    sstr = "sesc="
    line = search_line(sstr, l)
    if line < 0:
      sys.exit[0]
    session_id = l[line].split(sstr)[1].split('">')[0]
  else:
    session_id = ""
  
  #SEARCH FOR RECENT POST
  sstr = recent_str
  line = search_line(sstr, l)
  if line < 0:
    sys.exit[0]
   
  if board == "smf":
    url = l[line+9+loop_count].split('<a href="')[1].split('">')[0]
    author = ""
    date_submitted = ""
  elif board == "yabbse":
    url = l[line+4+loop_count].split('<td valign="top"><a href="')[1].split('">')[0]
    url_list = [ i[0] for i in url_list_pair ]    # SOLVE yabbse MESSAGE INDEX
    if url in url_list:
      i = url_list.index(url)
      url_list_pair[i][1] += 1
      index_dec =  url_list_pair[i][1]
    else:
      url_list_pair.append([url,0])
      index_dec = 0
    #   tmp = 'โดย '
    tmp = decoding('\xe0\xb9\x82\xe0\xb8\x94\xe0\xb8\xa2 ')
    author = l[line+4+loop_count].split(tmp)[1].split('</td>')[0]
  else:
    die_board()

  msgid = get_msgid(url)
  sock = opener.open(url)
  html = sock.read()
  sock.close()
  l = html.split('\n')

  #PARSE HTML
  is_spam = False
  spam_keyword = ''
  if board == "smf":
    sstr = "msg_%s" % (msgid,)
    line = search_line(sstr, l)
  elif board == "yabbse":
    sstr = '<hr width="100%" size="1" class="windowbg3">'
    count = (int(msgid)-index_dec) % 20 + 1   # 20 MESSAGES PER PAGE - yabbse INDEX DECREMENT
    print 'loop=',loop_count,' /// count=',count
    line = search_line(sstr, l, count)
    tmp = decoding("javascript:DoConfirm('")
    try:
      delete_url = l[line-3].split(tmp)[1].split("','")[1].split("""');"><img src""")[0]
      date_submitted = l[line-4].split('</B> ')[1].split(' &#187;')[0]
      title = l[line-5].split('<B>')[1].split('</b>')[0]
      process_line = line+1
      is_spam, spam_keyword = check_spam(l[process_line])
      if is_spam:
        print 'line=',line,' /// l[line-3]=', l[line-3]
        print 'delete_url=',delete_url
        print "is_spam=",is_spam," /// keyword=",spam_keyword," /// line=",l[process_line]
    except:
      is_spam = False
  else:
    die_board()

  if is_spam:
    if board == "smf":
      pass
    elif board == "yabbse":
      save_backup('delete url: '+delete_url+\
        '\nspam keyword: '+spam_keyword+\
        '\nscan date: '+time.ctime(time.time())+\
        '\ntitle: '+title+\
        '\nauthor: '+author+\
        '\nsubmitted date: '+date_submitted+\
        '\n'+l[process_line])
      sock = opener.open(delete_url)
      sock.close()
      url_list = [ i[0] for i in url_list_pair ]    # RESET yabbse MESSAGE INDEX
      if url in url_list:
        i = url_list.index(url)
        url_list_pair.remove(url_list_pair[i])
  else:
    loop_count = loop_count+1