Last active
December 18, 2015 05:38
-
-
Save bcbwilla/5733723 to your computer and use it in GitHub Desktop.
Scrapes interviews from the blog 'the chrome ball incident' (http://chromeballincident.blogspot.com) and then analyzes the word occurrence frequency. Output is then fed into wordle to make a word cloud (http://www.wordle.net/)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Scrapes interviews from the blog 'the chrome ball incident' for text analysis.""" | |
import urllib | |
import string | |
from bs4 import BeautifulSoup | |
# URL of the first page of interviews; later pages are reached by following
# each page's 'blog-pager-older-link' anchor.
url = 'http://chromeballincident.blogspot.com/search/label/chrome%20ball%20interview'

# Translation table that deletes every ASCII punctuation character. A dict of
# code points works with unicode.translate (Python 2) and str.translate
# (Python 3), unlike string.maketrans("", ""), which only exists on Python 2.
PUNCTUATION_TABLE = dict.fromkeys(map(ord, string.punctuation))


def clean_text(raw):
    """Return *raw* as ASCII-only text without newlines or punctuation.

    Non-ASCII characters are dropped, each newline becomes a single space,
    and every character in ``string.punctuation`` is removed (so "don't"
    becomes "dont").
    """
    ascii_only = raw.encode('ascii', 'ignore').decode('ascii')
    return ascii_only.replace('\n', ' ').translate(PUNCTUATION_TABLE)


def fetch_page(page_url):
    """Download *page_url* and return the raw response body."""
    # Imported here so the script runs on both Python 2 and Python 3 without
    # touching the file-level imports.
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib import urlopen  # Python 2
    response = urlopen(page_url)
    try:
        return response.read()
    finally:
        # Always release the connection, even if read() fails.
        response.close()


def scrape_interviews(start_url, max_pages=20):
    """Collect the cleaned text of all posts, starting at *start_url*.

    Follows the 'blog-pager-older-link' anchor from page to page and stops
    after *max_pages* pages or when a page has no such link, whichever comes
    first. Returns the concatenated, cleaned text of every
    ``div.post-outer`` element that was found.
    """
    page_url = start_url
    chunks = []
    for _ in range(max_pages):
        soup = BeautifulSoup(fetch_page(page_url), "html.parser")
        # get posts from page
        posts = soup.find_all('div', {'class': 'post-outer'})
        chunks.append(clean_text("".join(post.get_text() for post in posts)))
        # get url of next page of interviews
        older_link = soup.find('a', {'class': 'blog-pager-older-link'})
        next_url = older_link.get('href') if older_link else None
        if not next_url:
            break
        page_url = next_url
    return "".join(chunks)


def main():
    """Scrape the interviews and write the text to raw.txt for later analysis."""
    text = scrape_interviews(url)
    with open('raw.txt', 'w') as f:
        f.write(text)


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Analyzes text for word occurrence frequency and filters out common stop words. """ | |
from collections import defaultdict | |
import operator | |
# Words to exclude. Source: http://www.textfixer.com/resources/common-english-words.php
# The trailing entries "ball", "chrome" and "interview" are presumably extras
# for this particular blog rather than part of the source list -- TODO confirm.
COMMON_WORDS = """'tis,'twas,a,as,able,about,across,after,ain't,all,almost,also,am,among,an,and,any,are,aren't,
as,at,be,because,been,but,by,can,can't,cannot,could,could've,couldn't,dear,did,didn't,do,does,doesn't,don't,
either,else,ever,every,for,from,get,got,had,has,hasn't,have,he,he'd,he'll,he's,her,hers,him,his,how,how'd,
how'll,how's,however,i,i'd,i'll,i'm,i've,if,in,into,is,isn't,it,it's,its,just,least,let,like,likely,may,me,
might,might've,mightn't,most,must,must've,mustn't,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,
rather,said,say,says,shan't,she,she'd,she'll,she's,should,should've,shouldn't,since,so,some,than,that,that'll,
that's,the,their,them,then,there,there's,these,they,they'd,they'll,they're,they've,this,tis,to,too,twas,us,
wants,was,wasn't,we,we'd,we'll,we're,were,weren't,what,what'd,what's,when,when,when'd,when'll,when's,where,
where'd,where'll,where's,which,while,who,who'd,who'll,who's,whom,why,why'd,why'll,why's,will,with,won't,would,
would've,wouldn't,yet,you,you'd,you'll,you're,you've,your,ball,chrome,interview"""
# Apostrophes are removed so entries look like "dont" and "its". Side effect:
# "he'll" becomes "hell", "we'll" becomes "well", and so on, so those real
# words are excluded too.
# Each entry is stripped because the literal spans several lines: without it,
# the first word of every continuation line would keep its leading newline
# (e.g. "\neither") and never match a real word.
COMMON_WORDS = set(
    word.strip()
    for word in COMMON_WORDS.replace("'", '').split(',')
    if word.strip()
)
# More words to exclude, if you want to be even more restrictive.
# Source: http://www.ranks.nl/resources/stopwords.html
# To use it, test words against COMMON_WORDS2 instead of COMMON_WORDS when
# counting; it is a superset of COMMON_WORDS (see the union below).
COMMON_WORDS2 = """a,able,about,above,abst,accordance,according,accordingly,across,act,actually,added,adj,affected,
affecting,affects,after,afterwards,again,against,ah,all,almost,alone,along,already,also,although,always,am,among,
amongst,an,and,announce,another,any,anybody,anyhow,anymore,anyone,anything,anyway,anyways,anywhere,apparently,
approximately,are,aren,arent,arise,around,as,aside,ask,asking,at,auth,available,away,awfully,b,back,be,became,
because,become,becomes,becoming,been,before,beforehand,begin,beginning,beginnings,begins,behind,being,believe,
below,beside,besides,between,beyond,biol,both,brief,briefly,but,by,c,ca,came,can,cannot,cant,cause,causes,certain,
certainly,co,com,come,comes,contain,containing,contains,could,couldnt,d,date,did,didnt,different,do,does,doesnt,
doing,done,dont,down,downwards,due,during,e,each,ed,edu,effect,eg,eight,eighty,either,else,elsewhere,end,ending,
enough,especially,et,et-al,etc,even,ever,every,everybody,everyone,everything,everywhere,ex,except,f,far,few,ff,
fifth,first,five,fix,followed,following,follows,for,former,formerly,forth,found,four,from,further,furthermore,g,
gave,get,gets,getting,give,given,gives,giving,go,goes,gone,got,gotten,h,had,happens,hardly,has,hasnt,have,havent,
having,he,hed,hence,her,here,hereafter,hereby,herein,heres,hereupon,hers,herself,hes,hi,hid,him,himself,his,hither,
home,how,howbeit,however,hundred,i,id,ie,if,ill,im,immediate,immediately,importance,important,in,inc,indeed,index,
information,instead,into,invention,inward,is,isnt,it,itd,itll,its,itself,ive,j,just,k,keep,keeps,kept,kg,km,know,
known,knows,l,largely,last,lately,later,latter,latterly,least,less,lest,let,lets,like,liked,likely,line,little,ll,
look,looking,looks,ltd,m,made,mainly,make,makes,many,may,maybe,me,mean,means,meantime,meanwhile,merely,mg,might,
million,miss,ml,more,moreover,most,mostly,mr,mrs,much,mug,must,my,myself,n,na,name,namely,nay,nd,near,nearly,
necessarily,necessary,need,needs,neither,never,nevertheless,new,next,nine,ninety,no,nobody,non,none,nonetheless,
noone,nor,normally,nos,not,noted,nothing,now,nowhere,o,obtain,obtained,obviously,of,off,often,oh,ok,okay,old,
omitted,on,once,one,ones,only,onto,or,ord,other,others,otherwise,ought,our,ours,ourselves,out,outside,over,overall,
owing,own,p,page,pages,part,particular,particularly,past,per,perhaps,placed,please,plus,poorly,possible,possibly,
potentially,pp,predominantly,present,previously,primarily,probably,promptly,proud,provides,put,q,que,quickly,quite,
qv,r,ran,rather,rd,re,readily,really,recent,recently,ref,refs,regarding,regardless,regards,related,relatively,research,
respectively,resulted,resulting,results,right,run,s,said,same,saw,say,saying,says,sec,section,see,seeing,seem,seemed,
seeming,seems,seen,self,selves,sent,seven,several,shall,she,shed,shell,shes,should,shouldnt,show,showed,shown,showns,
shows,significant,significantly,similar,similarly,since,six,slightly,so,some,somebody,somehow,someone,somethan,something,
sometime,sometimes,somewhat,somewhere,soon,sorry,specifically,specified,specify,specifying,still,stop,strongly,sub,
substantially,successfully,such,sufficiently,suggest,sup,sure,t,take,taken,taking,tell,tends,th,than,thank,thanks,thanx,
that,thatll,thats,thatve,the,their,theirs,them,themselves,then,thence,there,thereafter,thereby,thered,therefore,therein,
therell,thereof,therere,theres,thereto,thereupon,thereve,these,they,theyd,theyll,theyre,theyve,think,this,those,thou,
though,thoughh,thousand,throug,through,throughout,thru,thus,til,tip,to,together,too,took,toward,towards,tried,tries,truly,
try,trying,ts,twice,two,u,un,under,unfortunately,unless,unlike,unlikely,until,unto,up,upon,ups,us,use,used,useful,
usefully,usefulness,uses,using,usually,v,value,various,ve,very,via,viz,vol,vols,vs,w,want,wants,was,wasnt,way,we,wed,
welcome,well,went,were,werent,weve,what,whatever,whatll,whats,when,whence,whenever,where,whereafter,whereas,whereby,
wherein,wheres,whereupon,wherever,whether,which,while,whim,whither,who,whod,whoever,whole,wholl,whom,whomever,whos,whose,
why,widely,willing,wish,with,within,without,wont,words,world,would,wouldve,wouldnt,www,x,y,yes,yet,you,youd,youll,your,
youre,yours,yourself,yourselves,youve,z,zero"""
# Each entry is stripped because the literal spans several lines: without it,
# the first word of every continuation line would keep its leading newline
# (e.g. "\naffecting") and never match a real word.
COMMON_WORDS2 = set(
    word.strip()
    for word in COMMON_WORDS2.replace("'", '').split(',')
    if word.strip()
).union(COMMON_WORDS)
def count_words(text, stop_words):
    """Count how often each word occurs in *text*, ignoring *stop_words*.

    Words are separated by any run of whitespace and compared in lower case.
    Returns a list of ``(word, count)`` pairs sorted by count, highest first.
    """
    counts = defaultdict(int)
    # split() with no argument splits on any whitespace and never yields
    # empty strings, so tabs or stray carriage returns cannot glue words
    # together and no explicit empty-token check is needed.
    for word in text.split():
        word = word.lower()
        if word not in stop_words:
            counts[word] += 1
    # items() exists on Python 2 and 3; iteritems() is Python 2 only.
    return sorted(counts.items(), key=operator.itemgetter(1), reverse=True)


def frequency_lines(freq, min_count=10):
    """Format ``(word, count)`` pairs as 'word: count' lines.

    Pairs whose count is below *min_count* are left out.
    """
    return ['%s: %d\n' % (word, count)
            for word, count in freq if count >= min_count]


def main():
    """Read raw.txt, count non-stop-words and write the results to all.txt."""
    # open text file
    with open('raw.txt', 'r') as f:
        text = f.read()
    # analyze text for frequency of word occurrence
    freq = count_words(text, COMMON_WORDS)
    # write to output file
    with open('all.txt', 'w') as f:
        f.writelines(frequency_lines(freq))


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment