แปลงไฟล์ จาก utf8 กลับเป็น tis620

 

มีงานต้องแปลงไฟล์กลับ เลยเขียนโค้ดอีกทีหนึ่ง
( งานที่ทำคือ สันติรำลึก เป็นการแปลงไฟล์กลับจาก Word มาเป็น HTML แบบ tis-620 )

#!/usr/bin/env python
# CONVERT FILE CONTENT FROM utf8 TO tis620
import sys,os

# GLOBAL VARS
#
# decodec IS THE ENCODING EACH INPUT LINE IS DECODED FROM AND encodec IS
# THE ENCODING IT IS RE-ENCODED TO (SEE convertline).  BOTH CAN BE CHANGED.
# NOTE: convertline REPLACES RAW UTF-8 BYTE SEQUENCES BEFORE DECODING, SO
# THOSE REPLACEMENTS ONLY MATCH WHILE THE INPUT REALLY IS UTF-8.
#
# THE STANDARD ENCODINGS ARE:
# ascii, big5, big5hkscs, cp037, cp424, cp437, cp500, cp737, cp775, cp850,
# cp852, cp855, cp856, cp857, cp860, cp861, cp862, cp863, cp864, cp865,
# cp866, cp869, cp874, cp875, cp932, cp949, cp950, cp1006, cp1026, cp1140,
# cp1250, cp1251, cp1252, cp1253, cp1254, cp1255, cp1256, cp1257, cp1258,
# euc_jp, euc_jis_2004, euc_jisx0213, euc_kr, gb2312, gbk, gb18030, hz,
# iso2022_jp, iso2022_jp_1, iso2022_jp_2, iso2022_jp_2004, iso2022_jp_3,
# iso2022_jp_ext, iso2022_kr, latin_1, iso8859_2, iso8859_3, iso8859_4,
# iso8859_5, iso8859_6, iso8859_7, iso8859_8, iso8859_9, iso8859_10,
# iso8859_13, iso8859_14, iso8859_15, johab, koi8_r, koi8_u, mac_cyrillic,
# mac_greek, mac_iceland, mac_latin2, mac_roman, mac_turkish, ptcp154,
# shift_jis, shift_jis_2004, shift_jisx0213, utf_16, utf_16_be, utf_16_le,
# utf_7, utf_8, utf_8_sig
#
# SEE https://docs.python.org/2/library/codecs.html#standard-encodings
# FOR MORE INFORMATION.
decodec = "utf8"
encodec = "cp874"


def usage(progname):
    """Print the two-line command-line help for this script.

    progname is the name shown after "Usage:".  The second line reports the
    source and target encodings read from the module-level decodec and
    encodec settings.
    """
    usage_line = "Usage: %s FILE" % (progname)
    print(usage_line)
    summary_line = "Convert FILE from %s to %s, save old file in FILE.bak" % (decodec, encodec)
    print(summary_line)


def cannotopenfile(filename):
    """Print a one-line message saying that filename could not be opened."""
    message = "Cannot open file %s" % (filename)
    print(message)


def genfilename(filename="",ext="new"):
    """Return a not-yet-existing sibling path of filename with a suffix.

    filename -- path the suffix is appended to; "" yields "".
    ext      -- "new" or "bak", matched case-insensitively; any other value
                is treated as "bak".

    Returns filename + "." + ext when that path does not exist, otherwise
    the first non-existing filename + "." + ext + N for N in 0..999.
    Returns "" when filename is empty or all 1000 numbered names are taken.

    Only existence is checked; nothing is created here, so the returned
    name can still be taken by something else before the caller opens it.
    """
    if filename == "":
        return ""
    # Lower-case once so "NEW" and "BAK" are treated alike.  Previously only
    # "new" was normalised and an upper-case "BAK" leaked into the file name.
    ext = ext.lower()
    if ext not in ("new", "bak"):
        ext = "bak"
    candidate = filename + "." + ext
    if not os.path.exists(candidate):
        return candidate
    # Plain name is taken: probe the numbered variants, giving up after 1000.
    for i in range(1000):
        numbered = candidate + str(i)
        if not os.path.exists(numbered):
            return numbered
    return ""


def replace_invalid_char(line,utf_char,tis_char):
    """Return line with every occurrence of utf_char replaced by tis_char."""
    cleaned = line.replace(utf_char, tis_char)
    return cleaned


def convertline(line):
    """Return line re-encoded from decodec to encodec.

    Before re-encoding, a fixed list of UTF-8 byte sequences is replaced,
    in order, by substitutes (ASCII text, another UTF-8 sequence, or
    nothing).  line is expected to be a byte string, since the patterns
    are raw UTF-8 bytes and the result comes from
    line.decode(decodec).encode(encodec).
    Errors raised by decode/encode are not caught here.
    """
    # (UTF-8 byte sequence, replacement) pairs, applied top to bottom.
    substitutions = (
        ("\xe2\x80\x98", "'"),              # U+2018 left single quote
        ("\xe2\x80\x99", "'"),              # U+2019 right single quote
        ("\xe2\x80\x9c", '"'),              # U+201C left double quote
        ("\xe2\x80\x9d", '"'),              # U+201D right double quote
        ("\xe2\x80\xa6", "..."),            # U+2026 ellipsis
        ("\xef\x9c\x8f", "\xe0\xb8\x8d"),   # U+F70F -> U+0E0D (YO YING)
        ("\xef\x9c\x9a", "\xe0\xb8\xba"),   # U+F71A -> U+0E3A (PHINTHU)
        ("\xe2\x80\x93", "-"),              # U+2013 en dash
        ("\xef\x82\xae", "->"),             # U+F0AE (private-use code point)
        ("\xef\xa3\x82", ""),               # U+F8C2, unknown: dropped
        ("\xef\xa3\x83", ""),               # U+F8C3, unknown: dropped
    )
    for utf_seq, substitute in substitutions:
        line = replace_invalid_char(line, utf_seq, substitute)
    decoded = line.decode(decodec)
    return decoded.encode(encodec)


def convertfile(fs_old, fs_new):
    """Write the converted form of every line of fs_old to fs_new.

    fs_old -- iterable yielding the input lines (an open file in this script).
    fs_new -- object with a write() method receiving each converted line.

    Each line goes through convertline(); any exception it raises
    propagates to the caller.  Returns True once all lines are written.
    """
    for raw_line in fs_old:
        fs_new.write(convertline(raw_line))
    return True
    

if __name__=="__main__":
    # Command-line entry point.  Converts the file named by the first
    # argument into a temporary "FILE.new", then renames the original to a
    # backup name and moves the converted file into its place.
    progname = os.path.basename(sys.argv[0])
    if len(sys.argv) < 2:
        usage(progname)
        sys.exit(1)
    oldfile = sys.argv[1]

    # Catch only I/O failures; a bare except would also swallow
    # KeyboardInterrupt and SystemExit.
    try:
        fsold = open(oldfile)
    except IOError:
        cannotopenfile(oldfile)
        sys.exit(1)

    newfile = genfilename(oldfile, "new")
    if newfile == "":
        # No free name for the converted output (not the backup).
        fsold.close()
        print("Cannot find a free name for the converted file")
        sys.exit(1)

    try:
        fsnew = open(newfile, "w")
    except IOError:
        fsold.close()
        cannotopenfile(newfile)
        sys.exit(1)

    # Close both handles even if conversion raises, so nothing leaks and
    # buffered output is flushed before the renames below.
    try:
        converted = convertfile(fsold, fsnew)
    finally:
        fsold.close()
        fsnew.close()
    if converted == False:
        print("Convert file %s faild" % (oldfile))
        sys.exit(1)

    bakfile = genfilename(oldfile, "bak")
    if bakfile == "":
        # The original is left untouched; newfile holds the converted output.
        print("Cannot create backup file, so converted file is %s" % (newfile))
        sys.exit(1)

    os.rename(oldfile, bakfile)
    os.rename(newfile, oldfile)
    print("Convert %s success, save backup file in %s" % (oldfile, bakfile))

โค้ดยังไม่เรียบร้อยดี แต่ขอแปะโค้ดไว้ก่อน