Skip to content

Instantly share code, notes, and snippets.

@philrenaud
Created October 22, 2014 20:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save philrenaud/06e8daeaff8e27f5646f to your computer and use it in GitHub Desktop.
Save philrenaud/06e8daeaff8e27f5646f to your computer and use it in GitHub Desktop.
from bs4 import BeautifulSoup
import urllib2
import json
import logging
# Scrapes a list of profile pages (the CSS class names suggest Twitter profile
# pages -- TODO confirm) and writes the extracted fields to parsed_folks.json.

# (output key, CSS class of the <li>) for each counter in the profile nav bar.
_NAV_COUNTERS = (
    ('num_tweets', 'ProfileNav-item--tweets'),
    ('following', 'ProfileNav-item--following'),
    ('followers', 'ProfileNav-item--followers'),
    ('favorites', 'ProfileNav-item--favorites'),
)


def _find_by_class(node, tag_name, css_class):
    """Return the first ``tag_name`` element under ``node`` with ``css_class``, or None."""
    return node.find(tag_name, {'class': css_class})


def _fetch_soup(url):
    """Download ``url`` and return its body parsed by BeautifulSoup.

    Any error raised by ``urllib2.urlopen`` propagates to the caller.
    """
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        # The response object is not closed automatically.
        response.close()
    return BeautifulSoup(html)


def _parse_tweet(tweet):
    """Return a dict with the 'text', 'timestamp' and 'author' found in ``tweet``.

    A key is omitted when the corresponding element (or, for the timestamp,
    its ``data-time`` attribute) is absent, so the result may be empty.
    """
    parsed = {}

    text = _find_by_class(tweet, 'p', 'ProfileTweet-text')
    if text is not None:
        parsed['text'] = text.text

    stamp = _find_by_class(tweet, 'span', 'js-short-timestamp')
    if stamp is not None and stamp.has_attr('data-time'):
        parsed['timestamp'] = stamp['data-time']

    author = _find_by_class(tweet, 'span', 'ProfileTweet-screenname')
    if author is not None:
        parsed['author'] = author.text.strip().replace('@', '')

    return parsed


def _parse_profile(soup):
    """Extract the profile fields and tweets from one parsed page.

    Returns a dict that always has a 'tweets' list; every other key
    ('num_tweets', 'following', 'followers', 'favorites', 'name', 'handle',
    'bio', 'joinDate', 'location', 'url', 'avatar') is present only when the
    matching element/attribute exists in the page.
    """
    person = {}

    # Nav-bar counters: <li class=...> containing <span class="ProfileNav-value">.
    for key, css_class in _NAV_COUNTERS:
        item = _find_by_class(soup, 'li', css_class)
        value = None if item is None else _find_by_class(item, 'span', 'ProfileNav-value')
        if value is not None:
            person[key] = value.text

    name = _find_by_class(soup, 'a', 'ProfileHeaderCard-nameLink')
    if name is not None:
        person['name'] = name.text

    handle = _find_by_class(soup, 'a', 'ProfileHeaderCard-screennameLink')
    if handle is not None:
        person['handle'] = handle.text.replace('@', '')

    bio = _find_by_class(soup, 'p', 'ProfileHeaderCard-bio')
    if bio is not None:
        person['bio'] = bio.text

    # The join date is read from the element's title attribute, not its text.
    joined = _find_by_class(soup, 'span', 'ProfileHeaderCard-joinDateText')
    if joined is not None and joined.get('title') is not None:
        person['joinDate'] = joined['title']

    location = _find_by_class(soup, 'span', 'ProfileHeaderCard-locationText')
    if location is not None:
        person['location'] = location.text

    # The span itself may be missing, so check it before looking for the link.
    url_span = _find_by_class(soup, 'span', 'ProfileHeaderCard-urlText')
    link = None if url_span is None else url_span.find('a')
    if link is not None and link.get('title') is not None:
        person['url'] = link['title']

    avatar = _find_by_class(soup, 'img', 'ProfileAvatar-image')
    if avatar is not None and avatar.get('src') is not None:
        # Swap the '400x400' size token in the image URL for 'mini'.
        person['avatar'] = avatar['src'].replace('400x400', 'mini')

    # Tweets whose class list contains 'is-pinned' are left out.
    person['tweets'] = [
        _parse_tweet(tweet)
        for tweet in soup.find_all('div', {'class': 'ProfileTweet'})
        if 'is-pinned' not in tweet.get('class', [])
    ]

    return person


def main():
    """Scrape every URL listed in folks.json and write parsed_folks.json.

    folks.json is iterated and each element is passed to ``urllib2.urlopen``,
    so it is expected to be a JSON array of URLs. The output file holds a
    JSON array with one object per input URL, in the same order.
    """
    # Root logger at DEBUG with the default stream handler, so each URL is
    # echoed as it is processed.
    logging.basicConfig(level=logging.DEBUG)

    with open('folks.json', 'r') as f:
        folks = json.load(f)

    parsed_folks = []
    for folk in folks:
        logging.debug(folk)
        parsed_folks.append(_parse_profile(_fetch_soup(folk)))

    with open('parsed_folks.json', 'w') as outfile:
        json.dump(parsed_folks, outfile)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment