# pedrotnascimento/ansi_or_utf8_text_processing.py

## ansi_or_utf8_text_processing.py
import sys
# The path of the file to process is the first command-line argument; without
# it the script has nothing to work on, so stop with a usage message.
try:
    FILE_PATH = sys.argv[1]
except IndexError:
    # sys.exit() replaces the site-provided exit() helper, which is meant for
    # the interactive shell and is not guaranteed to be defined. Passing the
    # message makes the interpreter write it to stderr and exit with status 1,
    # so a calling shell can tell that the run failed.
    sys.exit(
        "need pass a file as input parameter\n"
        "python my_script.py my_csv_file.csv"
    )

def predict_encoding(file_path, n_lines=20):
    '''Predict a file's encoding using chardet.

    Reads up to ``n_lines`` lines from the start of the file in binary mode
    and hands the concatenated bytes to ``chardet.detect``.

    Args:
        file_path: Path of the file to inspect.
        n_lines: Maximum number of leading lines to sample (default 20).

    Returns:
        The ``'encoding'`` entry of chardet's detection result.
        NOTE(review): chardet may report ``None`` here when it cannot make a
        guess -- confirm that callers handle that value.
    '''
    import chardet

    # Open the file as binary data so no decoding happens before detection.
    with open(file_path, 'rb') as f:
        # readline() returns b'' once the file is exhausted, so files shorter
        # than n_lines simply contribute fewer bytes.
        rawdata = b''.join(f.readline() for _ in range(n_lines))

    # Run the detection exactly once on the sampled bytes.
    return chardet.detect(rawdata)['encoding']

# Encoding name that marks a file as "ANSI"; ansiToUtf8 compares the detected
# encoding against this value.
ANSI_CODE = "ISO-8859-1"
# Encoding detected for the input file, computed once when the module loads.
FILE_ENCODE = predict_encoding(FILE_PATH)

def ansiToUtf8(string, FILE_ENCODE):
    '''Decode a byte string to text according to the detected file encoding.

    Args:
        string: Encoded byte string (``str`` on Python 2, ``bytes`` on
            Python 3).
        FILE_ENCODE: Encoding name detected for the file the bytes came from.

    Returns:
        The decoded text: decoded as cp1252 when ``FILE_ENCODE`` equals
        ``ANSI_CODE``, otherwise decoded as UTF-8.

    Raises:
        UnicodeDecodeError: If the bytes are not valid in the chosen codec.
    '''
    # NOTE(review): input detected as ISO-8859-1 is decoded with cp1252, as
    # before; presumably because Windows "ANSI" files are really cp1252 --
    # confirm.
    codec = "cp1252" if FILE_ENCODE == ANSI_CODE else "utf-8"
    # bytes.decode() gives the same result as unicode(string, codec) for byte
    # strings on Python 2 and also works on Python 3, where unicode() is gone.
    return string.decode(codec)