Skip to content

Instantly share code, notes, and snippets.

@alexlamazing
Created May 18, 2017 09:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save alexlamazing/399bf72330f5c52c63776d0a10aece86 to your computer and use it in GitHub Desktop.
Save alexlamazing/399bf72330f5c52c63776d0a10aece86 to your computer and use it in GitHub Desktop.
# import the urllib2 library, used to query web pages
import urllib2
# import Beautiful Soup, used to parse the HTML of the returned page
from bs4 import BeautifulSoup
#import pandas to convert list to data frame
import pandas as pd
import csv
# Scrape the Hong Kong street-list pages of locatefamily.com and write one
# CSV file per listing page (page0001.csv, page0002.csv, ...).

# URL of a listing page is BASE_URL + <page number> + ".html".
BASE_URL = "http://www.locatefamily.com/Street-Lists/Hong-Kong/index-"
# Inclusive range of listing pages to download.
FIRST_PAGE = 1
LAST_PAGE = 1102
# Column order of every CSV file that is written.
COLUMNS = ['PageNo', 'No', 'Name', 'Address', 'Telephone']


def fetch_rows(page_no):
    """Download one listing page and return its person rows.

    Every ``<tr>`` inside a ``<tbody>`` of a ``<table class="table">`` that
    has exactly three ``<td>`` cells yields one tuple
    ``(page_no, number, name, address, telephone)``. Rows with any other
    number of ``<td>`` cells are ignored.

    ``number`` is the first text node of the row's first ``<th>`` (``None``
    when the row has no ``<th>``). ``address`` and ``telephone`` are the
    first text node of their cells, or ``None`` when a cell holds no text.
    ``name`` is the text content of the first cell.
    """
    url = BASE_URL + str(page_no) + ".html"
    page = urllib2.urlopen(url)
    try:
        # BeautifulSoup reads the whole response while constructing the
        # tree, so the connection can be released right afterwards.
        soup = BeautifulSoup(page, "html.parser")
    finally:
        page.close()

    rows = []
    for table in soup.findAll('table', class_='table'):
        for body in table.findAll("tbody"):
            for row in body.findAll("tr"):
                cells = row.findAll('td')
                if len(cells) != 3:
                    continue
                headers = row.findAll('th')
                # Keep the row even when the numbering cell is missing
                # instead of failing with an IndexError.
                number = headers[0].find(text=True) if headers else None
                rows.append((
                    page_no,
                    number,
                    # Store the cell's text, not the Tag object itself, so
                    # the CSV holds plain text like the other columns.
                    cells[0].get_text(" ", strip=True),
                    cells[1].find(text=True),
                    cells[2].find(text=True),
                ))
    return rows


def main():
    """Scrape every page in FIRST_PAGE..LAST_PAGE into its own CSV file."""
    for page_no in range(FIRST_PAGE, LAST_PAGE + 1):
        df = pd.DataFrame(fetch_rows(page_no), columns=COLUMNS)
        # Zero-pad the page number so the files sort in page order.
        file_name = 'page' + '{:04d}'.format(page_no) + '.csv'
        df.to_csv(file_name, sep=',', encoding='utf-8')


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment