# Source: GitHub Gist nandajavarma/9868209 by @nandajavarma (last active August 29, 2015).
# -*- coding: utf-8 -*-
import urllib2
import re
import csv
from bs4 import BeautifulSoup
import sys
from sys import argv
def getlinks(csvfile):
    """Collect the second column of every non-empty row of a CSV file.

    Spaces in each value are replaced with underscores and the result is
    split on newlines, so every entry of the returned list is itself a list
    of strings.

    Args:
        csvfile: Path of the CSV file to read.

    Returns:
        A list with one list of strings per non-empty row, in file order.
        The first row is not treated specially; it is included like any
        other.

    Raises:
        IndexError: If a non-empty row has fewer than two columns.
    """
    # The csv module expects a binary file on Python 2 but a text file opened
    # with newline='' on Python 3.
    if sys.version_info[0] >= 3:
        handle = open(csvfile, 'r', newline='')
    else:
        handle = open(csvfile, 'rb')

    links = []
    with handle as f:
        reader = csv.reader(f)
        for row in reader:
            if not row:
                # Blank line in the file: nothing to collect.
                continue
            if len(row) < 2:
                # Same exception type the bare row[1] lookup would raise,
                # but saying where the malformed row is.
                raise IndexError(
                    "%s: row ending at line %d has no second column"
                    % (csvfile, reader.line_num))
            links.append(row[1].replace(' ', '_').split('\n'))
    return links
def create_csv(counts, filename="csvfile.csv"):
    """Write character counts to a one-column CSV file.

    The file starts with a ``character_count`` header row, followed by one
    row per value in ``counts``. Rows are terminated with ``\\n``. An
    existing file at ``filename`` is overwritten.

    Args:
        counts: Iterable of values to write, one per row.
        filename: Path of the file to write. Defaults to ``csvfile.csv`` in
            the current working directory.

    Returns:
        None.
    """
    with open(filename, "w") as output:
        writer = csv.writer(output, lineterminator='\n')
        writer.writerow(["character_count"])
        # Each value becomes its own single-cell row.
        writer.writerows([val] for val in counts)
def getcontent(links):
    """Fetch each linked ml.wikisource.org page and count its characters.

    For every entry after the first, the page at
    ``http://ml.wikisource.org/wiki/<entry[0]>`` is downloaded, the
    ``<div class="pagetext">`` element is located, its markup tags are
    stripped with a regular expression, spaces are removed, and the length
    of what remains is recorded.

    Args:
        links: List of entries whose element ``[0]`` is a page title. The
            first entry is skipped (presumably the CSV header row -- confirm
            against the input file). The list is not modified.

    Returns:
        A list of character counts, one per processed entry, in order. A
        page without a ``pagetext`` div contributes ``0`` so the counts stay
        aligned with the entries.
    """
    urlhead = 'http://ml.wikisource.org/wiki/'
    tag_pattern = re.compile(r'<[^>]*>')
    counts = []
    # Slice instead of pop(0): leaves the caller's list intact and also
    # copes with an empty list.
    for title in links[1:]:
        url = urlhead + title[0]
        print("counting characters in " + url)
        page = urllib2.urlopen(url)
        try:
            soup = BeautifulSoup(page)
        finally:
            # Release the HTTP connection even if parsing fails.
            page.close()
        container = soup.find("div", {"class": "pagetext"})
        if container is None:
            # str(None) would otherwise be counted as the 4 characters "None".
            counts.append(0)
            continue
        text_parts = str(container).decode("UTF-8")
        text = tag_pattern.sub('', text_parts).replace(' ', '')
        counts.append(len(text))
    return counts
def getoutput(source_csv='thal_creation.csv', counts_csv='csvfile.csv',
              output_csv='charactercount_output.csv'):
    """Append the character-count column to the source CSV rows.

    Rows of ``source_csv`` and ``counts_csv`` are paired in order and each
    pair is written to ``output_csv`` as one concatenated row. Blank rows in
    ``source_csv`` are skipped without consuming a count. Writing stops when
    either input runs out of rows.

    Args:
        source_csv: Path of the CSV whose rows are extended.
        counts_csv: Path of the CSV providing the extra column(s).
        output_csv: Path of the CSV to write; overwritten if it exists.

    Returns:
        None.
    """
    # The csv module expects binary files on Python 2 but text files opened
    # with newline='' on Python 3.
    if sys.version_info[0] >= 3:
        def open_csv(path, mode):
            return open(path, mode, newline='')
    else:
        def open_csv(path, mode):
            return open(path, mode + 'b')

    with open_csv(source_csv, 'r') as source:
        with open_csv(counts_csv, 'r') as counts:
            with open_csv(output_csv, 'w') as output:
                count_rows = csv.reader(counts)
                writer = csv.writer(output)
                for row in csv.reader(source):
                    if not row:
                        # getlinks() ignores blank rows, so no count was
                        # produced for them; skipping keeps rows aligned.
                        continue
                    count_row = next(count_rows, None)
                    if count_row is None:
                        # No more counts: nothing left to pair up.
                        break
                    writer.writerow(row + count_row)
# Script entry point: read titles from the CSV named on the command line,
# count the characters of each page, and write the result files.
if __name__ == '__main__':
    if len(argv) < 2:
        # Passing a string to sys.exit prints it to stderr and exits with
        # status 1, so a missing argument is reported as a failure.
        sys.exit("USAGE: python parsecsv.py <csvfilename>")
    links = getlinks(argv[1])
    counts = getcontent(links)
    create_csv(counts)
    # NOTE(review): getoutput() is called without arguments, so it merges
    # against 'thal_creation.csv' rather than argv[1] -- confirm intended.
    getoutput()
# (GitHub page footer: "Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment")