Skip to content

Instantly share code, notes, and snippets.

@qzcool
Created May 3, 2018 05:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save qzcool/bf0d8d70e30df1d56162687bc9db759b to your computer and use it in GitHub Desktop.
Save qzcool/bf0d8d70e30df1d56162687bc9db759b to your computer and use it in GitHub Desktop.
公司标准名称清洗
from bs4 import BeautifulSoup
import requests, datetime
from tqdm import *
import pandas as pd
# Search-URL prefixes; the company name to look up is appended directly
# after the prefix to form the full query URL.
# Bing (cn.bing.com): the query starts with "site:tianyancha.com " (URL-encoded).
engine_bing = 'https://cn.bing.com/search?q=site%3Atianyancha.com+'
# Baidu: the query starts with "site: tianyancha.com " (URL-encoded).
# NOTE(review): not referenced in the visible part of this file.
engine_baidu = 'https://www.baidu.com/s?wd=site%3A%20tianyancha.com%20'
def get_exact_name(name_list, name_column, timeout=10):
    """Look up each company name on Bing and record the first search hit.

    For every row of the Excel sheet ``name_list`` the value in
    ``name_column`` is searched via ``engine_bing``. The text of the first
    ``<a target="_blank">`` link in the result page, cut at the first ``_``
    and then at the first ``' - '``, is taken as the cleaned name. The
    results are written to ``Name_list_<YYYYMMDD_HH_MM_SS>.xlsx`` in the
    current directory, and the fraction of rows whose cleaned name differs
    from the input name is printed.

    Args:
        name_list: Path (or file-like object) of the input Excel sheet. It
            must contain the columns ``partyID``, ``产品名称`` and
            ``name_column``.
        name_column: Name of the column holding the company names to look up.
        timeout: Seconds to wait for each HTTP request before giving up on
            that row.

    Returns:
        None. The result is the Excel file written to disk and the printed
        mismatch ratio.
    """
    # Function-scope import: keeps this block self-contained without
    # touching the file's import header.
    from urllib.parse import quote_plus

    # No `encoding` argument: current pandas rejects it for Excel I/O.
    df = pd.read_excel(name_list)

    # Loop-invariant, so built once instead of once per row.
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/4E423F'}

    # Rows are collected in a list and turned into a DataFrame once at the
    # end; DataFrame.append was removed in pandas 2.0 and copied the whole
    # frame on every call.
    rows = []
    # zip over the columns iterates positionally, so a non-default index on
    # the input sheet cannot break the lookup.
    records = zip(df['partyID'], df['产品名称'], df[name_column])
    for party_id, product_name, name_be in tqdm(records, total=len(df)):
        name_af = ''
        try:
            # quote_plus escapes characters such as '&', '#' and '+' that
            # would otherwise truncate or alter the search query.
            r = requests.get(engine_bing + quote_plus(name_be),
                             headers=headers, timeout=timeout)
        except requests.RequestException:
            # One failed request must not discard the rows already
            # processed; the row is recorded as "no match" instead.
            r = None

        if r is not None:
            r.encoding = 'utf-8'
            soup = BeautifulSoup(r.text, 'lxml')
            link = soup.find('a', target='_blank')
            # find() returns None when the page has no such link.
            if link is not None:
                name_af = link.get_text().split('_')[0].split(' - ')[0]

        rows.append({
            # Drops a trailing ".0" when the ID was read as a float.
            'partyID': str(party_id).split('.')[0],
            'Name_product': product_name,
            'Name_af': name_af,
            'Name_be': name_be,
            # Stored as the strings 'True'/'False', as in the output format
            # this function has always produced.
            'Consistency': 'True' if name_af == name_be else 'False',
        })

    df2 = pd.DataFrame(
        rows,
        columns=['partyID', 'Name_product', 'Name_af', 'Name_be', 'Consistency'],
    )

    stamp = datetime.datetime.now().strftime('%Y%m%d_%H_%M_%S')
    df2.to_excel('Name_list_' + stamp + '.xlsx')

    # Fraction of names that changed; guard against an empty input sheet.
    if len(df2):
        print(len(df2[df2['Consistency'] == 'False']) / len(df2))
    else:
        print(0.0)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment