Last active
July 12, 2024 01:53
-
-
Save iOS0x00/1e370da2e6ec99ce479a to your computer and use it in GitHub Desktop.
2000w数据csv入库修正版,基于网上的版本修正,不要太在意代码风格 :doge:
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding:utf-8 -*- | |
import os | |
import csv | |
import MySQLdb as mysql | |
inpath = '.' | |
uipath = unicode(inpath, "utf-8") | |
files = os.listdir(uipath) # 存放2000万开房数据的路径 | |
print files | |
def validate(line): | |
drop = False | |
if line[0] == '\xf0\xa6\x8d\x8b\xe5\xa4\xa7\xe7\xb6\xad': | |
return True | |
if len(line[2]) > 300: | |
print 'Desciot illegal', line[2] | |
drop = True | |
elif len(line[5]) > 4: | |
print 'Gender illegal', line[5] | |
drop = True | |
elif len(line[20]) > 80: | |
print 'Tel illegal', line[20] | |
drop = True | |
elif len(line[21]) > 80: | |
print 'Fax illegal', line[21] | |
drop = True | |
elif len(line[24]) > 300: | |
print 'Taste illegal', line[24] | |
drop = True | |
# if drop: | |
# print '---------- ', line | |
return drop | |
try: | |
conn = mysql.connect(host="localhost", user="root", passwd="123456", port=3306) | |
conn.set_character_set('utf8') | |
cur = conn.cursor() | |
cur.execute('set names utf8') | |
cur.execute('SET CHARACTER SET utf8') | |
cur.execute('SET character_set_connection=utf8') | |
sql = "CREATE DATABASE IF NOT EXISTS HOTELDATA DEFAULT CHARACTER SET utf8 COLLATE utf8_general_ci" | |
cur.execute(sql) | |
cur.execute("USE HOTELDATA") | |
cur.execute("DROP TABLE IF EXISTS KAIFANG") | |
sql = ''' CREATE TABLE KAIFANG( | |
Name varchar(50), | |
CardNo varchar(30), | |
Descriot varchar(300), | |
CtfTp varchar(30), | |
CtfId varchar(50), | |
Gender varchar(4), | |
Birthday varchar(30), | |
Address varchar(100), | |
Zip varchar(30), | |
Dirty varchar(30), | |
District1 varchar(30), | |
District2 varchar(30), | |
District3 varchar(30), | |
District4 varchar(30), | |
District5 varchar(30), | |
District6 varchar(30), | |
FirstNm varchar(100), | |
LastNm varchar(100), | |
Duty varchar(30), | |
Mobile varchar(100), | |
Tel varchar(80), | |
Fax varchar(80), | |
EMail varchar(100), | |
Nation varchar(30), | |
Taste varchar(300), | |
Education varchar(30), | |
Company varchar(50), | |
CTel varchar(100), | |
CAddress varchar(50), | |
CZip varchar(30), | |
Family varchar(30), | |
Version varchar(30), | |
id varchar(30) | |
) ENGINE=InnoDB DEFAULT CHARSET=utf8''' | |
cur.execute(sql) | |
for this_file in files: # 循环处理文件 | |
# file = uipath + '\\' + file | |
print this_file # 输出文件名 | |
if not this_file.endswith('.csv'): | |
continue | |
fp = open(this_file, "rU") | |
lines = csv.reader(fp) | |
data = [] | |
i = 0 | |
j = 0 | |
for line in lines: | |
i += 1 | |
if i == 1: | |
continue # 每个文件的第一行不处理 | |
datalen = len(line) | |
if datalen != 33: | |
if datalen == 8: | |
line.extend([''] * 25) | |
elif datalen == 26: | |
line.extend([''] * 7) | |
else: | |
print '++++++++ERROR++++++ ', i, datalen, line # 输出长度不对的行 | |
continue | |
if not validate(line): | |
data.append(line) | |
j += 1 | |
# print data[0][0], data[0][7] | |
# 50w记录先入一次 | |
if j >= 500000: | |
print '50w record in' | |
cur.executemany("INSERT INTO KAIFANG values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s," | |
"%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s, %s, %s, %s)", data) | |
conn.commit() | |
j = 0 | |
data = [] | |
# 不满50w的入 | |
if j > 0: | |
print j, 'record in' | |
cur.executemany("INSERT INTO KAIFANG values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s," | |
"%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s, %s, %s, %s)", data) | |
conn.commit() | |
cur.close() | |
conn.close() | |
except mysql.Error, e: | |
print "Error Mysql %d:%s" % (e.args[0], e.args[1]) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
fp = open('a.html', 'r') | |
data = fp.read() | |
fp.close() | |
lines = data.split('\n') | |
#print len(lines) | |
print 'INSERT INTO KAIFANG(`Name`, `CtfTp`, `CtfId`, `Birthday`, `Mobile`, `Address`) VALUES' | |
for l in lines: | |
a = eval(l) | |
name = a[0].split()[0].strip() if a[0] else '' | |
if not name: | |
continue | |
try: | |
id_index = a.index('ID') | |
id_txt = a[id_index + 1] | |
address = a[id_index + 4] | |
except ValueError: | |
id_txt = '' | |
except IndexError: | |
address = '' | |
birthday = '' | |
for i in a: | |
if len(i) == 8: | |
birthday = i | |
break | |
mobile = '' | |
for i in a: | |
if len(i) == 11: | |
mobile = i | |
break | |
print "('{name}', 'ID', '{id_txt}', '{birthday}', '{mobile}', '{address}'),".format(name=name, | |
birthday=birthday, id_txt=id_txt, address=address, mobile=mobile) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment