Created
March 23, 2012 04:32
-
-
Save wynemo/2166836 to your computer and use it in GitHub Desktop.
Check for files that contain Chinese text but are not encoded in UTF-8.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#coding:utf-8
# File-name suffixes of the files whose encoding is checked; the scanner
# compares them against the lower-cased path, so matching is case-insensitive.
suffix_list = ['.cpp', '.c', '.h', '.hpp', '.txt', '.html', '.htm', '.xml', '.py']
def FoundInvalidFile(input_folder, suffixes=None):
    """Report files under *input_folder* whose content is not valid UTF-8.

    The folder is walked recursively. Every file whose name ends with one of
    the wanted suffixes (compared case-insensitively) is read as bytes and
    decoded as UTF-8. For each file that fails to decode, one line is printed:
    the absolute path (with backslashes shown as forward slashes), a space,
    and the decoder's error message.

    Args:
        input_folder: Directory to scan; it is made absolute before walking.
        suffixes: Optional iterable of file-name suffixes to check. When
            omitted, the module-level ``suffix_list`` is used.

    Returns:
        A list of the absolute paths that failed to decode, in traversal
        order (directories and file names are visited in sorted order).

    Raises:
        OSError / IOError: If a directory cannot be listed or a file cannot
            be read; such errors are not swallowed.

    Note:
        Symbolic links to directories are not descended into, so a link that
        points back at an ancestor cannot cause endless recursion.
    """
    import os  # kept local so the function does not rely on file-level imports

    if suffixes is None:
        suffixes = suffix_list
    # str.endswith accepts a tuple, which replaces the manual suffix loop.
    wanted = tuple(suffix.lower() for suffix in suffixes)

    def _propagate(error):
        # os.walk ignores directory-listing errors by default; re-raise so an
        # unreadable directory is not silently skipped.
        raise error

    invalid = []
    for folder, subdirs, names in os.walk(os.path.abspath(input_folder),
                                          onerror=_propagate):
        subdirs.sort()  # in-place sort makes the descent order deterministic
        for name in sorted(names):
            if not name.lower().endswith(wanted):
                continue
            path = os.path.join(folder, name)
            # Open the real path; the slash-normalised form is for display
            # only, because on POSIX a backslash may be part of the name.
            with open(path, 'rb') as handle:
                data = handle.read()
            try:
                # A leading UTF-8 BOM (EF BB BF) is itself valid UTF-8 and
                # decodes to U+FEFF, so it needs no special handling.
                data.decode('utf-8')
            except UnicodeDecodeError as exc:
                print("%s %s" % (path.replace('\\', '/'), exc))
                invalid.append(path)
    return invalid
if __name__ == "__main__":
    # Run as a script: scan the current working directory.
    FoundInvalidFile('.')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment