Skip to content

Instantly share code, notes, and snippets.

@liuderchi
Created July 7, 2016 06:24
Show Gist options
  • Save liuderchi/fbd03dd5158249c41219bf94823ecbea to your computer and use it in GitHub Desktop.
Save liuderchi/fbd03dd5158249c41219bf94823ecbea to your computer and use it in GitHub Desktop.
# coding=utf8
import re
import sys
import urllib
from contextlib import closing

from bs4 import BeautifulSoup as BS
url = 'http://www.dgpa.gov.tw/'

# Compiled once at import time so the scan loop only performs the searches.
# NOTE(review): "105" is presumably the ROC calendar year (2016) that the
# announcement titles start with -- confirm and update when the year changes.
TITLE_PATTERN = re.compile(u'^105')
CITY_PATTERN = re.compile(u'臺北市')  # "Taipei City"


def fetch_page_strings(page_url):
    """Download *page_url* and return its text strings.

    The page is parsed with BeautifulSoup's ``html.parser`` backend and every
    whitespace-stripped string of the document is returned, in document
    order, as a list of unicode strings.
    """
    # ``urlopen`` moved between Python 2 and 3; resolve it lazily so the
    # script runs unchanged on both.
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib import urlopen  # Python 2
    # ``closing`` guarantees the connection is released even if read() fails.
    with closing(urlopen(page_url)) as response:
        html_content = response.read()
    soup_obj = BS(html_content, 'html.parser')
    return list(soup_obj.stripped_strings)


def find_notices(texts):
    """Scan page strings and return the lines the script reports.

    Args:
        texts: iterable of unicode strings in page order.

    Returns:
        A list of ``(label, text)`` tuples in report order, where *label* is
        ``'title'`` for a string starting with ``105``, ``'city'`` for a
        string containing ``臺北市``, and ``'info'`` for the string that
        immediately follows a city match.  A city match on the very last
        string yields no ``'info'`` entry.
    """
    texts = list(texts)
    notices = []
    for position, text in enumerate(texts):
        if TITLE_PATTERN.search(text):
            notices.append(('title', text))
        if CITY_PATTERN.search(text):
            notices.append(('city', text))
            # Use the position of *this* match: looking the string up with
            # list.index() would return the first equal string and so report
            # the wrong follower when the city name appears more than once.
            if position + 1 < len(texts):
                notices.append(('info', texts[position + 1]))
    return notices


def main():
    """Fetch the page at ``url`` and print its title, city and info lines."""
    for label, text in find_notices(fetch_page_strings(url)):
        # Two spaces after the colon reproduce the output of the original
        # ``print 'label: ', text`` statements.
        line = u'%s:  %s' % (label, text)
        if sys.version_info[0] < 3:
            # Python 2 cannot write non-ASCII unicode to a pipe or file
            # reliably; emit UTF-8 bytes as the original script did.
            line = line.encode('utf8')
        print(line)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment