# coding=utf8
# liuderchi/typhoon_dayoff_crawler.py

## typhoon_dayoff_crawler.py
import urllib
import re
from bs4 import BeautifulSoup as BS

url = 'http://www.dgpa.gov.tw/'

# Compiled once instead of on every loop iteration.
TITLE_PATTERN = re.compile(u'^105')   # strings that start with "105"
CITY_PATTERN = re.compile(u'臺北市')   # strings that mention Taipei City


def find_announcements(texts):
    """Pick the title, city and info lines out of a page's strings.

    Args:
        texts: sequence of text (unicode) strings in page order.

    Returns:
        A list of ``(label, text)`` tuples in the order they should be
        printed.  A string starting with ``105`` yields a ``'title: '``
        entry.  A string containing the city name yields a ``'city: '``
        entry followed by an ``'info: '`` entry holding the string right
        after it; the info entry is omitted when the city string is the
        last one.
    """
    found = []
    last = len(texts) - 1
    # enumerate() gives the position of *this* occurrence.  Looking it up
    # with texts.index() returned the first equal string, so a repeated
    # city string reported the wrong follow-up line.
    for position, text in enumerate(texts):
        if TITLE_PATTERN.search(text):
            found.append((u'title: ', text))
        if CITY_PATTERN.search(text):
            found.append((u'city: ', text))
            if position < last:  # a trailing match has no next line
                found.append((u'info: ', texts[position + 1]))
    return found


def _print_entry(label, text):
    """Print one ``label text`` line, separated by a single space."""
    line = u'%s %s' % (label, text)
    if str is bytes:
        # Python 2: emit UTF-8 bytes so the output does not depend on the
        # stdout codec.
        line = line.encode('utf8')
    print(line)


def main():
    """Download ``url``, extract its strings and print the matches."""
    # urlopen lives in different modules on Python 2 and Python 3.
    try:
        from urllib.request import urlopen
    except ImportError:
        from urllib import urlopen

    response = urlopen(url)
    try:
        html_content = response.read()
    finally:
        response.close()  # do not leak the connection

    soup_obj = BS(html_content, 'html.parser')
    texts = list(soup_obj.stripped_strings)  # every string on the page
    for label, text in find_announcements(texts):
        _print_entry(label, text)


if __name__ == '__main__':
    main()