Skip to content

Instantly share code, notes, and snippets.

@yono
Created May 1, 2010 14:35
Show Gist options
  • Save yono/386379 to your computer and use it in GitHub Desktop.
Save yono/386379 to your computer and use it in GitHub Desktop.
twilog からデータ収集
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import datetime
import io
import time
from datetime import date

from mechanize import Browser
from BeautifulSoup import BeautifulSoup, NavigableString, Declaration, Comment
def getNavigableStrings(soup):
    """Yield the non-blank text nodes found under *soup*, in document order.

    *soup* is either a ``NavigableString`` or a tag-like node exposing
    ``name`` and ``contents``.  Strings whose exact type is ``Comment`` or
    ``Declaration`` are skipped, as are strings that are empty after
    stripping whitespace.  Nodes named ``script`` or ``style`` are not
    descended into.
    """
    if not isinstance(soup, NavigableString):
        # Tag-like node: ignore script/style subtrees, recurse into the rest.
        if soup.name in ('script', 'style'):
            return
        for child in soup.contents:
            for text in getNavigableStrings(child):
                yield text
        return

    # Leaf string: keep it only if it is ordinary, non-blank text.
    is_markup_node = type(soup) in (Comment, Declaration)
    if not is_markup_node and soup.strip():
        yield soup
def get_tweets(url):
    """Fetch *url* and return the text of each ``<p class="tl-text">`` element.

    The page is downloaded with a mechanize ``Browser``, decoded using the
    encoding the browser reports, and parsed with BeautifulSoup.  Each
    matching paragraph contributes one string built by concatenating the
    text nodes that ``getNavigableStrings`` yields for it.

    Returns:
        A list of strings, one per matching paragraph, in page order.
    """
    browser = Browser()
    browser.open(url)
    raw = browser.response().read()
    # 'ignore' drops bytes that cannot be decoded instead of raising.
    html = raw.decode(browser.encoding(), 'ignore')
    paragraphs = BeautifulSoup(html).findAll('p', {"class": "tl-text"})
    return [''.join(getNavigableStrings(p)) for p in paragraphs]
def loop_date(start=date(2010, 3, 8), stop=date(2010, 5, 1)):
    """Download one twilog page per day and save its tweets to a text file.

    For every day from *start* up to, but not including, *stop*, the page
    ``http://twilog.org/yono/date-YYMMDD`` is fetched through ``get_tweets``
    and the returned tweets are written, one per line, to
    ``tweets/YYMMDD.txt`` encoded as UTF-8.  The ``tweets`` directory is
    expected to exist already.  Errors raised while fetching a page or
    opening/writing a file propagate to the caller.

    Args:
        start: First day to fetch, as a ``datetime.date``.
        stop: Day at which to stop; this day itself is not fetched.  When
            it is not later than *start*, nothing is fetched at all.
    """
    base_url = 'http://twilog.org/yono/date-'
    one_day = datetime.timedelta(days=1)
    d = start
    while d < stop:
        # Two-digit year, zero-padded month and day, e.g. 2010-03-08 -> "100308".
        formatted_date = d.strftime('%y%m%d')
        tweets = get_tweets(base_url + formatted_date)
        # Encode explicitly: under Python 2 the built-in open() would encode
        # unicode text as ASCII on write and fail on non-ASCII tweets.  The
        # with-block also closes the file if the write raises.
        with io.open("tweets/%s.txt" % formatted_date, 'w',
                     encoding='utf-8') as out:
            # A unicode separator keeps the result unicode even when the
            # list is empty, which io.open's text mode requires.
            out.write(u'\n'.join(tweets))
        print("GET %s" % formatted_date)
        # Wait five seconds between requests (presumably to go easy on the
        # remote server).
        time.sleep(5)
        d += one_day
# Script entry point: crawl the whole configured date range.
# For a quick check of a single day, call
# get_tweets('http://twilog.org/yono/date-100331') instead.
if "__main__" == __name__:
    loop_date()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment