Skip to content

Instantly share code, notes, and snippets.

@luisdaniel
Created April 3, 2013 00:55
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save luisdaniel/5297598 to your computer and use it in GitHub Desktop.
Save luisdaniel/5297598 to your computer and use it in GitHub Desktop.
from BeautifulSoup import BeautifulSoup
import re
import urllib2
import json
import csv
import io


def _scrape_post(url):
    """Download one blog post and return its fields as a dict.

    The title is the first child of the first <h2> element, the date is
    the first child of the first element with class "date", and the text
    is the concatenated text content of every <p> element on the page.

    Raises AttributeError if the page has no <h2> or no "date" element
    (``soup.find`` returns None in that case), and whatever
    ``urllib2.urlopen`` raises when the URL cannot be fetched.
    """
    page = urllib2.urlopen(url).read()
    soup = BeautifulSoup(page)
    title = soup.find(['h2']).contents[0]
    date = soup.find(attrs={"class": "date"}).contents[0]
    # Join once instead of growing a string with += inside a loop.
    text = ''.join(
        ''.join(node.findAll(text=True)) for node in soup.findAll('p')
    )
    return {
        "url": url,
        "title": title,
        "date": date,
        "text": text,
    }


# Python 2's csv module expects the file opened in binary mode; the
# `with` block guarantees the handle is closed once the rows are read.
with open('narcoBlogLinks.csv', 'rb') as links_file:
    rows = list(csv.reader(links_file, delimiter=','))

posts = []
# The first row is skipped (presumably a header line -- confirm against
# the CSV); the third column of each remaining row is the URL to fetch.
for row in rows[1:]:
    post = _scrape_post(row[2])
    print(post['date'])  # progress output, one date per scraped post
    posts.append(post)

# Serialise every post as ONE JSON array. Dumping each dict separately
# into the same file yields concatenated objects ("{...}{...}"), which
# is not a valid JSON document and cannot be read back with json.load.
# ensure_ascii=False keeps accented characters as real characters
# instead of \uXXXX escapes; the file is therefore written as UTF-8.
payload = json.dumps(posts, ensure_ascii=False)
if isinstance(payload, bytes):
    # On Python 2, json.dumps returns a byte string when no unicode input
    # was involved (e.g. an empty list); io.open needs text.
    payload = payload.decode('utf-8')

# The output file is only opened after all pages were scraped, so a
# failed download does not leave a truncated data.json behind.
with io.open('data.json', 'w', encoding='utf-8') as outfile:
    outfile.write(payload)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment