Created
May 18, 2017 09:54
-
-
Save alexlamazing/399bf72330f5c52c63776d0a10aece86 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Scrape the Hong Kong street-list pages of locatefamily.com and write one
# CSV file per page (page0001.csv, page0002.csv, ...).
import csv
from contextlib import closing

# urllib2 is used to query the web pages (Python 2). On Python 3 the
# equivalent urllib.request module is bound to the same name.
try:
    import urllib2
except ImportError:
    import urllib.request as urllib2

# Beautiful Soup is used to parse the HTML of the returned page.
from bs4 import BeautifulSoup
# pandas is used to convert the collected rows to a data frame.
import pandas as pd

URL_TEMPLATE = "http://www.locatefamily.com/Street-Lists/Hong-Kong/index-{}.html"
FIRST_PAGE = 1
LAST_PAGE = 1102
# Seconds to wait on a single request, so that one stalled connection
# cannot hang the whole crawl.
REQUEST_TIMEOUT = 30
COLUMNS = ['PageNo', 'No', 'Name', 'Address', 'Telephone']


def fetch_soup(page_no):
    """Download listing page *page_no* and return it parsed as BeautifulSoup.

    The HTTP response is closed as soon as parsing has finished. Errors
    raised by ``urlopen`` (including a timeout) propagate to the caller.
    """
    url = URL_TEMPLATE.format(page_no)
    with closing(urllib2.urlopen(url, timeout=REQUEST_TIMEOUT)) as response:
        return BeautifulSoup(response, "html.parser")


def extract_rows(soup, page_no):
    """Return one tuple per data row found in *soup*, in ``COLUMNS`` order.

    Only ``<tr>`` elements that sit inside a ``<tbody>`` of a
    ``<table class="table">`` and contain exactly three ``<td>`` cells are
    treated as data rows; every other row is ignored.
    """
    rows = []
    for table in soup.find_all('table', class_='table'):
        for body in table.find_all("tbody"):
            for row in body.find_all("tr"):
                cells = row.find_all('td')
                if len(cells) != 3:
                    continue
                headers = row.find_all('th')
                # A row without a <th> keeps its data; only 'No' is left empty.
                number = headers[0].find(text=True) if headers else None
                rows.append((
                    page_no,
                    number,
                    # The name column keeps the whole cell markup, as before,
                    # but rendered to a string up front instead of handing
                    # pandas a Tag object.
                    cells[0].decode(),
                    cells[1].find(text=True),
                    cells[2].find(text=True),
                ))
    return rows


for pageNo in range(FIRST_PAGE, LAST_PAGE + 1):
    df = pd.DataFrame(extract_rows(fetch_soup(pageNo), pageNo), columns=COLUMNS)
    # Zero-padded page number keeps the output files in order when sorted by name.
    df.to_csv('page{:04d}.csv'.format(pageNo), sep=',', encoding='utf-8')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment