# bcbwilla/skatescraper.py

## skatescraper.py
"""Scrapes interviews from the blog 'the chrome ball incident' for text analysis."""

import urllib
import string

from bs4 import BeautifulSoup

# url of first page of interviews
url = 'http://chromeballincident.blogspot.com/search/label/chrome%20ball%20interview'
interviews = 0  # NOTE: never updated or read below; kept so the script's names are unchanged
text = ''

# urlopen lives in urllib on Python 2 and in urllib.request on Python 3
try:
    from urllib.request import urlopen
except ImportError:
    from urllib import urlopen

# upper bound on the number of listing pages fetched
MAX_PAGES = 20
# a set makes the per-character punctuation test cheap
PUNCTUATION = frozenset(string.punctuation)

# get the interview text, one cleaned-up string per listing page
pages = []
for i in range(MAX_PAGES):
    # fetch the page and always release the connection, even if read() fails
    response = urlopen(url)
    try:
        html = response.read()
    finally:
        response.close()
    soup = BeautifulSoup(html, "html.parser")

    # get posts from page
    posts = soup.find_all('div', {'class': 'post-outer'})
    # drop non-ascii characters; decode again so the result is text (not bytes) on Python 3
    post_texts = [p.get_text().encode('ascii', 'ignore').decode('ascii') for p in posts]
    # join posts with a space so the last word of one post cannot fuse with
    # the first word of the next, then remove newlines
    t = " ".join(post_texts).replace('\n', ' ')
    # remove all punctuation (so "don't" becomes "dont")
    t = "".join(ch for ch in t if ch not in PUNCTUATION)
    pages.append(t)

    # get url of next page of interviews; stop when there is no usable link
    next_a = soup.find('a', {'class': 'blog-pager-older-link'})
    next_url = next_a.get('href') if next_a else None
    if not next_url:
        break
    url = next_url

# pages are separated by a space for the same reason posts are
text = " ".join(pages)

# write to file for later analysis; the with-block closes the file even on error
with open('raw.txt', 'w') as f:
    f.write(text)

## skatescraperanalysis.py
"""Analyzes text for word occurrence frequency and filters out common stop words. """

from collections import defaultdict
import operator

def _parse_stop_words(raw):
    """Return the set of stop words contained in a comma-separated string.

    Apostrophes are removed so that entries such as "don't" become "dont",
    matching text whose punctuation has been stripped.  Surrounding
    whitespace is removed from every entry because the word lists below are
    wrapped over several lines: without this, the first word of each
    continuation line would keep a leading newline (e.g. "\\neither") and
    would never match a real word.  Empty entries are dropped.
    """
    candidates = (entry.strip() for entry in raw.replace("'", '').split(','))
    return set(entry for entry in candidates if entry)


# words to exclude. source: http://www.textfixer.com/resources/common-english-words.php
COMMON_WORDS = _parse_stop_words("""'tis,'twas,a,as,able,about,across,after,ain't,all,almost,also,am,among,an,and,any,are,aren't,
as,at,be,because,been,but,by,can,can't,cannot,could,could've,couldn't,dear,did,didn't,do,does,doesn't,don't,
either,else,ever,every,for,from,get,got,had,has,hasn't,have,he,he'd,he'll,he's,her,hers,him,his,how,how'd,
how'll,how's,however,i,i'd,i'll,i'm,i've,if,in,into,is,isn't,it,it's,its,just,least,let,like,likely,may,me,
might,might've,mightn't,most,must,must've,mustn't,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,
rather,said,say,says,shan't,she,she'd,she'll,she's,should,should've,shouldn't,since,so,some,than,that,that'll,
that's,the,their,them,then,there,there's,these,they,they'd,they'll,they're,they've,this,tis,to,too,twas,us,
wants,was,wasn't,we,we'd,we'll,we're,were,weren't,what,what'd,what's,when,when,when'd,when'll,when's,where,
where'd,where'll,where's,which,while,who,who'd,who'll,who's,whom,why,why'd,why'll,why's,will,with,won't,would,
would've,wouldn't,yet,you,you'd,you'll,you're,you've,your,ball,chrome,interview""")

# more words to exclude, if you want to be even more restrictive. source: http://www.ranks.nl/resources/stopwords.html
# (this larger set also contains every word of COMMON_WORDS)
COMMON_WORDS2 = _parse_stop_words("""a,able,about,above,abst,accordance,according,accordingly,across,act,actually,added,adj,affected,
affecting,affects,after,afterwards,again,against,ah,all,almost,alone,along,already,also,although,always,am,among,
amongst,an,and,announce,another,any,anybody,anyhow,anymore,anyone,anything,anyway,anyways,anywhere,apparently,
approximately,are,aren,arent,arise,around,as,aside,ask,asking,at,auth,available,away,awfully,b,back,be,became,
because,become,becomes,becoming,been,before,beforehand,begin,beginning,beginnings,begins,behind,being,believe,
below,beside,besides,between,beyond,biol,both,brief,briefly,but,by,c,ca,came,can,cannot,cant,cause,causes,certain,
certainly,co,com,come,comes,contain,containing,contains,could,couldnt,d,date,did,didnt,different,do,does,doesnt,
doing,done,dont,down,downwards,due,during,e,each,ed,edu,effect,eg,eight,eighty,either,else,elsewhere,end,ending,
enough,especially,et,et-al,etc,even,ever,every,everybody,everyone,everything,everywhere,ex,except,f,far,few,ff,
fifth,first,five,fix,followed,following,follows,for,former,formerly,forth,found,four,from,further,furthermore,g,
gave,get,gets,getting,give,given,gives,giving,go,goes,gone,got,gotten,h,had,happens,hardly,has,hasnt,have,havent,
having,he,hed,hence,her,here,hereafter,hereby,herein,heres,hereupon,hers,herself,hes,hi,hid,him,himself,his,hither,
home,how,howbeit,however,hundred,i,id,ie,if,ill,im,immediate,immediately,importance,important,in,inc,indeed,index,
information,instead,into,invention,inward,is,isnt,it,itd,itll,its,itself,ive,j,just,k,keep,keeps,kept,kg,km,know,
known,knows,l,largely,last,lately,later,latter,latterly,least,less,lest,let,lets,like,liked,likely,line,little,ll,
look,looking,looks,ltd,m,made,mainly,make,makes,many,may,maybe,me,mean,means,meantime,meanwhile,merely,mg,might,
million,miss,ml,more,moreover,most,mostly,mr,mrs,much,mug,must,my,myself,n,na,name,namely,nay,nd,near,nearly,
necessarily,necessary,need,needs,neither,never,nevertheless,new,next,nine,ninety,no,nobody,non,none,nonetheless,
noone,nor,normally,nos,not,noted,nothing,now,nowhere,o,obtain,obtained,obviously,of,off,often,oh,ok,okay,old,
omitted,on,once,one,ones,only,onto,or,ord,other,others,otherwise,ought,our,ours,ourselves,out,outside,over,overall,
owing,own,p,page,pages,part,particular,particularly,past,per,perhaps,placed,please,plus,poorly,possible,possibly,
potentially,pp,predominantly,present,previously,primarily,probably,promptly,proud,provides,put,q,que,quickly,quite,
qv,r,ran,rather,rd,re,readily,really,recent,recently,ref,refs,regarding,regardless,regards,related,relatively,research,
respectively,resulted,resulting,results,right,run,s,said,same,saw,say,saying,says,sec,section,see,seeing,seem,seemed,
seeming,seems,seen,self,selves,sent,seven,several,shall,she,shed,shell,shes,should,shouldnt,show,showed,shown,showns,
shows,significant,significantly,similar,similarly,since,six,slightly,so,some,somebody,somehow,someone,somethan,something,
sometime,sometimes,somewhat,somewhere,soon,sorry,specifically,specified,specify,specifying,still,stop,strongly,sub,
substantially,successfully,such,sufficiently,suggest,sup,sure,t,take,taken,taking,tell,tends,th,than,thank,thanks,thanx,
that,thatll,thats,thatve,the,their,theirs,them,themselves,then,thence,there,thereafter,thereby,thered,therefore,therein,
therell,thereof,therere,theres,thereto,thereupon,thereve,these,they,theyd,theyll,theyre,theyve,think,this,those,thou,
though,thoughh,thousand,throug,through,throughout,thru,thus,til,tip,to,together,too,took,toward,towards,tried,tries,truly,
try,trying,ts,twice,two,u,un,under,unfortunately,unless,unlike,unlikely,until,unto,up,upon,ups,us,use,used,useful,
usefully,usefulness,uses,using,usually,v,value,various,ve,very,via,viz,vol,vols,vs,w,want,wants,was,wasnt,way,we,wed,
welcome,well,went,were,werent,weve,what,whatever,whatll,whats,when,whence,whenever,where,whereafter,whereas,whereby,
wherein,wheres,whereupon,wherever,whether,which,while,whim,whither,who,whod,whoever,whole,wholl,whom,whomever,whos,whose,
why,widely,willing,wish,with,within,without,wont,words,world,would,wouldve,wouldnt,www,x,y,yes,yet,you,youd,youll,your,
youre,yours,yourself,yourselves,youve,z,zero""").union(COMMON_WORDS)

# open text file (the with-block closes it even if reading fails)
with open('raw.txt', 'r') as f:
    text = f.read()

# only words seen at least this many times are written to the output file
MIN_COUNT = 10

# analyze text for frequency of word occurrence
d = defaultdict(int)
# split() without an argument splits on any run of whitespace and never
# yields empty strings, so tabs or carriage returns left in the raw text
# cannot glue two words into one token
for word in text.split():
    word = word.lower()
    if word not in COMMON_WORDS:
        d[word] += 1

# sort by frequency, most frequent first; items() works on Python 2 and 3
freq = sorted(d.items(), key=operator.itemgetter(1), reverse=True)

# write to output file, one "word: count" line per sufficiently frequent word
with open('all.txt', 'w') as f:
    for word, count in freq:
        if count >= MIN_COUNT:
            f.write('%s: %d\n' % (word, count))
	"""Scrapes interviews from the blog 'the chrome ball incident' for text analysis."""

	import urllib
	import string

	from bs4 import BeautifulSoup

	# url of first page of interviews
	url = 'http://chromeballincident.blogspot.com/search/label/chrome%20ball%20interview'
	interviews = 0  # NOTE: never updated or read below; kept so the script's names are unchanged
	text = ''

	# urlopen lives in urllib on Python 2 and in urllib.request on Python 3
	try:
		from urllib.request import urlopen
	except ImportError:
		from urllib import urlopen

	# upper bound on the number of listing pages fetched
	MAX_PAGES = 20
	# a set makes the per-character punctuation test cheap
	PUNCTUATION = frozenset(string.punctuation)

	# get the interview text, one cleaned-up string per listing page
	# (the loop and branch bodies below are nested again; this copy had lost its indentation)
	pages = []
	for i in range(MAX_PAGES):
		# fetch the page and always release the connection, even if read() fails
		response = urlopen(url)
		try:
			html = response.read()
		finally:
			response.close()
		soup = BeautifulSoup(html, "html.parser")

		# get posts from page
		posts = soup.find_all('div', {'class': 'post-outer'})
		# drop non-ascii characters; decode again so the result is text (not bytes) on Python 3
		post_texts = [p.get_text().encode('ascii', 'ignore').decode('ascii') for p in posts]
		# join posts with a space so the last word of one post cannot fuse with
		# the first word of the next, then remove newlines
		t = " ".join(post_texts).replace('\n', ' ')
		# remove all punctuation (so "don't" becomes "dont")
		t = "".join(ch for ch in t if ch not in PUNCTUATION)
		pages.append(t)

		# get url of next page of interviews; stop when there is no usable link
		next_a = soup.find('a', {'class': 'blog-pager-older-link'})
		next_url = next_a.get('href') if next_a else None
		if not next_url:
			break
		url = next_url

	# pages are separated by a space for the same reason posts are
	text = " ".join(pages)

	# write to file for later analysis; the with-block closes the file even on error
	with open('raw.txt', 'w') as f:
		f.write(text)
	"""Analyzes text for word occurrence frequency and filters out common stop words. """

	from collections import defaultdict
	import operator

	def _parse_stop_words(raw):
		"""Return the set of stop words contained in a comma-separated string.

		Apostrophes are removed so that entries such as "don't" become "dont",
		matching text whose punctuation has been stripped.  Surrounding
		whitespace is removed from every entry because the word lists below are
		wrapped over several lines: without this, the first word of each
		continuation line would keep a leading newline (and tab) and would
		never match a real word.  Empty entries are dropped.
		"""
		candidates = (entry.strip() for entry in raw.replace("'", '').split(','))
		return set(entry for entry in candidates if entry)


	# words to exclude. source: http://www.textfixer.com/resources/common-english-words.php
	COMMON_WORDS = _parse_stop_words("""'tis,'twas,a,as,able,about,across,after,ain't,all,almost,also,am,among,an,and,any,are,aren't,
	as,at,be,because,been,but,by,can,can't,cannot,could,could've,couldn't,dear,did,didn't,do,does,doesn't,don't,
	either,else,ever,every,for,from,get,got,had,has,hasn't,have,he,he'd,he'll,he's,her,hers,him,his,how,how'd,
	how'll,how's,however,i,i'd,i'll,i'm,i've,if,in,into,is,isn't,it,it's,its,just,least,let,like,likely,may,me,
	might,might've,mightn't,most,must,must've,mustn't,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,
	rather,said,say,says,shan't,she,she'd,she'll,she's,should,should've,shouldn't,since,so,some,than,that,that'll,
	that's,the,their,them,then,there,there's,these,they,they'd,they'll,they're,they've,this,tis,to,too,twas,us,
	wants,was,wasn't,we,we'd,we'll,we're,were,weren't,what,what'd,what's,when,when,when'd,when'll,when's,where,
	where'd,where'll,where's,which,while,who,who'd,who'll,who's,whom,why,why'd,why'll,why's,will,with,won't,would,
	would've,wouldn't,yet,you,you'd,you'll,you're,you've,your,ball,chrome,interview""")

	# more words to exclude, if you want to be even more restrictive. source: http://www.ranks.nl/resources/stopwords.html
	# (this larger set also contains every word of COMMON_WORDS)
	COMMON_WORDS2 = _parse_stop_words("""a,able,about,above,abst,accordance,according,accordingly,across,act,actually,added,adj,affected,
	affecting,affects,after,afterwards,again,against,ah,all,almost,alone,along,already,also,although,always,am,among,
	amongst,an,and,announce,another,any,anybody,anyhow,anymore,anyone,anything,anyway,anyways,anywhere,apparently,
	approximately,are,aren,arent,arise,around,as,aside,ask,asking,at,auth,available,away,awfully,b,back,be,became,
	because,become,becomes,becoming,been,before,beforehand,begin,beginning,beginnings,begins,behind,being,believe,
	below,beside,besides,between,beyond,biol,both,brief,briefly,but,by,c,ca,came,can,cannot,cant,cause,causes,certain,
	certainly,co,com,come,comes,contain,containing,contains,could,couldnt,d,date,did,didnt,different,do,does,doesnt,
	doing,done,dont,down,downwards,due,during,e,each,ed,edu,effect,eg,eight,eighty,either,else,elsewhere,end,ending,
	enough,especially,et,et-al,etc,even,ever,every,everybody,everyone,everything,everywhere,ex,except,f,far,few,ff,
	fifth,first,five,fix,followed,following,follows,for,former,formerly,forth,found,four,from,further,furthermore,g,
	gave,get,gets,getting,give,given,gives,giving,go,goes,gone,got,gotten,h,had,happens,hardly,has,hasnt,have,havent,
	having,he,hed,hence,her,here,hereafter,hereby,herein,heres,hereupon,hers,herself,hes,hi,hid,him,himself,his,hither,
	home,how,howbeit,however,hundred,i,id,ie,if,ill,im,immediate,immediately,importance,important,in,inc,indeed,index,
	information,instead,into,invention,inward,is,isnt,it,itd,itll,its,itself,ive,j,just,k,keep,keeps,kept,kg,km,know,
	known,knows,l,largely,last,lately,later,latter,latterly,least,less,lest,let,lets,like,liked,likely,line,little,ll,
	look,looking,looks,ltd,m,made,mainly,make,makes,many,may,maybe,me,mean,means,meantime,meanwhile,merely,mg,might,
	million,miss,ml,more,moreover,most,mostly,mr,mrs,much,mug,must,my,myself,n,na,name,namely,nay,nd,near,nearly,
	necessarily,necessary,need,needs,neither,never,nevertheless,new,next,nine,ninety,no,nobody,non,none,nonetheless,
	noone,nor,normally,nos,not,noted,nothing,now,nowhere,o,obtain,obtained,obviously,of,off,often,oh,ok,okay,old,
	omitted,on,once,one,ones,only,onto,or,ord,other,others,otherwise,ought,our,ours,ourselves,out,outside,over,overall,
	owing,own,p,page,pages,part,particular,particularly,past,per,perhaps,placed,please,plus,poorly,possible,possibly,
	potentially,pp,predominantly,present,previously,primarily,probably,promptly,proud,provides,put,q,que,quickly,quite,
	qv,r,ran,rather,rd,re,readily,really,recent,recently,ref,refs,regarding,regardless,regards,related,relatively,research,
	respectively,resulted,resulting,results,right,run,s,said,same,saw,say,saying,says,sec,section,see,seeing,seem,seemed,
	seeming,seems,seen,self,selves,sent,seven,several,shall,she,shed,shell,shes,should,shouldnt,show,showed,shown,showns,
	shows,significant,significantly,similar,similarly,since,six,slightly,so,some,somebody,somehow,someone,somethan,something,
	sometime,sometimes,somewhat,somewhere,soon,sorry,specifically,specified,specify,specifying,still,stop,strongly,sub,
	substantially,successfully,such,sufficiently,suggest,sup,sure,t,take,taken,taking,tell,tends,th,than,thank,thanks,thanx,
	that,thatll,thats,thatve,the,their,theirs,them,themselves,then,thence,there,thereafter,thereby,thered,therefore,therein,
	therell,thereof,therere,theres,thereto,thereupon,thereve,these,they,theyd,theyll,theyre,theyve,think,this,those,thou,
	though,thoughh,thousand,throug,through,throughout,thru,thus,til,tip,to,together,too,took,toward,towards,tried,tries,truly,
	try,trying,ts,twice,two,u,un,under,unfortunately,unless,unlike,unlikely,until,unto,up,upon,ups,us,use,used,useful,
	usefully,usefulness,uses,using,usually,v,value,various,ve,very,via,viz,vol,vols,vs,w,want,wants,was,wasnt,way,we,wed,
	welcome,well,went,were,werent,weve,what,whatever,whatll,whats,when,whence,whenever,where,whereafter,whereas,whereby,
	wherein,wheres,whereupon,wherever,whether,which,while,whim,whither,who,whod,whoever,whole,wholl,whom,whomever,whos,whose,
	why,widely,willing,wish,with,within,without,wont,words,world,would,wouldve,wouldnt,www,x,y,yes,yet,you,youd,youll,your,
	youre,yours,yourself,yourselves,youve,z,zero""").union(COMMON_WORDS)

	# open text file (the with-block closes it even if reading fails)
	with open('raw.txt', 'r') as f:
		text = f.read()

	# only words seen at least this many times are written to the output file
	MIN_COUNT = 10

	# analyze text for frequency of word occurrence
	# (the loop and branch bodies below are nested again; this copy had lost its indentation)
	d = defaultdict(int)
	# split() without an argument splits on any run of whitespace and never
	# yields empty strings, so tabs or carriage returns left in the raw text
	# cannot glue two words into one token
	for word in text.split():
		word = word.lower()
		if word not in COMMON_WORDS:
			d[word] += 1

	# sort by frequency, most frequent first; items() works on Python 2 and 3
	freq = sorted(d.items(), key=operator.itemgetter(1), reverse=True)

	# write to output file, one "word: count" line per sufficiently frequent word
	with open('all.txt', 'w') as f:
		for word, count in freq:
			if count >= MIN_COUNT:
				f.write('%s: %d\n' % (word, count))