Skip to content

Instantly share code, notes, and snippets.

@bcbwilla
Last active December 18, 2015 05:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bcbwilla/5733723 to your computer and use it in GitHub Desktop.
Save bcbwilla/5733723 to your computer and use it in GitHub Desktop.
Scrapes interviews from the blog 'the chrome ball incident' (http://chromeballincident.blogspot.com) and then analyzes the word occurrence frequency. Output is then fed into wordle to make a word cloud (http://www.wordle.net/)
"""Scrapes interviews from the blog 'the chrome ball incident' for text analysis."""
import urllib
import string
from bs4 import BeautifulSoup
# Scrape the interview posts page by page, strip punctuation, and dump the
# combined plain text to 'raw.txt' for later analysis.
# NOTE: this uses the Python 2 APIs urllib.urlopen and the two-argument
# str.translate(table, deletechars) form.

# url of first page of interviews
url = 'http://chromeballincident.blogspot.com/search/label/chrome%20ball%20interview'
interviews = 0  # NOTE(review): never read or updated anywhere in this script
# identity table: translate() is used below only for its delete-chars argument,
# so build the table once instead of on every page
identity_table = string.maketrans("", "")
page_texts = []
# get the interview text, following the "older posts" link for at most 20 pages
for i in range(20):
    response = urllib.urlopen(url)
    try:
        html = response.read()
    finally:
        # always release the connection, even if the read fails
        response.close()
    soup = BeautifulSoup(html, "html.parser")
    # get posts from page
    posts = soup.find_all('div', {'class': 'post-outer'})
    # get text from posts (dropping non-ASCII characters) and remove newlines;
    # posts are joined with a space so the last word of one post cannot fuse
    # with the first word of the next
    t = " ".join([p.get_text().encode('ascii', 'ignore') for p in posts]).replace('\n', ' ')
    # remove all punctuation
    t = t.translate(identity_table, string.punctuation)
    page_texts.append(t)
    # get url of next page of interviews; stop when there is no older page
    next_a = soup.find('a', {'class': 'blog-pager-older-link'})
    next_url = next_a.get('href') if next_a else None
    if not next_url:
        break
    url = next_url
# pages are space-separated for the same reason posts are
text = " ".join(page_texts)
# write to file for later analysis; 'with' closes the file even on error
with open('raw.txt', 'w') as f:
    f.write(text)
"""Analyzes text for word occurrence frequency and filters out common stop words. """
from collections import defaultdict
import operator
# words to exclude. source: http://www.textfixer.com/resources/common-english-words.php
# The last three entries (ball, chrome, interview) are the words of the blog
# label being scraped.
COMMON_WORDS = """'tis,'twas,a,as,able,about,across,after,ain't,all,almost,also,am,among,an,and,any,are,aren't,
as,at,be,because,been,but,by,can,can't,cannot,could,could've,couldn't,dear,did,didn't,do,does,doesn't,don't,
either,else,ever,every,for,from,get,got,had,has,hasn't,have,he,he'd,he'll,he's,her,hers,him,his,how,how'd,
how'll,how's,however,i,i'd,i'll,i'm,i've,if,in,into,is,isn't,it,it's,its,just,least,let,like,likely,may,me,
might,might've,mightn't,most,must,must've,mustn't,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,
rather,said,say,says,shan't,she,she'd,she'll,she's,should,should've,shouldn't,since,so,some,than,that,that'll,
that's,the,their,them,then,there,there's,these,they,they'd,they'll,they're,they've,this,tis,to,too,twas,us,
wants,was,wasn't,we,we'd,we'll,we're,were,weren't,what,what'd,what's,when,when,when'd,when'll,when's,where,
where'd,where'll,where's,which,while,who,who'd,who'll,who's,whom,why,why'd,why'll,why's,will,with,won't,would,
would've,wouldn't,yet,you,you'd,you'll,you're,you've,your,ball,chrome,interview"""
# Turn the comma-separated list into a set for fast membership tests.
# Apostrophes are removed so entries look like "dont" rather than "don't".
# Each token is stripped because the literal above wraps after commas: without
# strip() the first word of every continuation line would keep a leading
# newline (e.g. "\neither") and would never match a real word.
COMMON_WORDS = set(
    token.strip()
    for token in COMMON_WORDS.replace("'", '').split(',')
    if token.strip()
)
# more words to exclude, if you want to be even more restrictive. source: http://www.ranks.nl/resources/stopwords.html
# NOTE(review): the counting loop in this file filters with COMMON_WORDS;
# swap in COMMON_WORDS2 there to use this larger list.
COMMON_WORDS2 = """a,able,about,above,abst,accordance,according,accordingly,across,act,actually,added,adj,affected,
affecting,affects,after,afterwards,again,against,ah,all,almost,alone,along,already,also,although,always,am,among,
amongst,an,and,announce,another,any,anybody,anyhow,anymore,anyone,anything,anyway,anyways,anywhere,apparently,
approximately,are,aren,arent,arise,around,as,aside,ask,asking,at,auth,available,away,awfully,b,back,be,became,
because,become,becomes,becoming,been,before,beforehand,begin,beginning,beginnings,begins,behind,being,believe,
below,beside,besides,between,beyond,biol,both,brief,briefly,but,by,c,ca,came,can,cannot,cant,cause,causes,certain,
certainly,co,com,come,comes,contain,containing,contains,could,couldnt,d,date,did,didnt,different,do,does,doesnt,
doing,done,dont,down,downwards,due,during,e,each,ed,edu,effect,eg,eight,eighty,either,else,elsewhere,end,ending,
enough,especially,et,et-al,etc,even,ever,every,everybody,everyone,everything,everywhere,ex,except,f,far,few,ff,
fifth,first,five,fix,followed,following,follows,for,former,formerly,forth,found,four,from,further,furthermore,g,
gave,get,gets,getting,give,given,gives,giving,go,goes,gone,got,gotten,h,had,happens,hardly,has,hasnt,have,havent,
having,he,hed,hence,her,here,hereafter,hereby,herein,heres,hereupon,hers,herself,hes,hi,hid,him,himself,his,hither,
home,how,howbeit,however,hundred,i,id,ie,if,ill,im,immediate,immediately,importance,important,in,inc,indeed,index,
information,instead,into,invention,inward,is,isnt,it,itd,itll,its,itself,ive,j,just,k,keep,keeps,kept,kg,km,know,
known,knows,l,largely,last,lately,later,latter,latterly,least,less,lest,let,lets,like,liked,likely,line,little,ll,
look,looking,looks,ltd,m,made,mainly,make,makes,many,may,maybe,me,mean,means,meantime,meanwhile,merely,mg,might,
million,miss,ml,more,moreover,most,mostly,mr,mrs,much,mug,must,my,myself,n,na,name,namely,nay,nd,near,nearly,
necessarily,necessary,need,needs,neither,never,nevertheless,new,next,nine,ninety,no,nobody,non,none,nonetheless,
noone,nor,normally,nos,not,noted,nothing,now,nowhere,o,obtain,obtained,obviously,of,off,often,oh,ok,okay,old,
omitted,on,once,one,ones,only,onto,or,ord,other,others,otherwise,ought,our,ours,ourselves,out,outside,over,overall,
owing,own,p,page,pages,part,particular,particularly,past,per,perhaps,placed,please,plus,poorly,possible,possibly,
potentially,pp,predominantly,present,previously,primarily,probably,promptly,proud,provides,put,q,que,quickly,quite,
qv,r,ran,rather,rd,re,readily,really,recent,recently,ref,refs,regarding,regardless,regards,related,relatively,research,
respectively,resulted,resulting,results,right,run,s,said,same,saw,say,saying,says,sec,section,see,seeing,seem,seemed,
seeming,seems,seen,self,selves,sent,seven,several,shall,she,shed,shell,shes,should,shouldnt,show,showed,shown,showns,
shows,significant,significantly,similar,similarly,since,six,slightly,so,some,somebody,somehow,someone,somethan,something,
sometime,sometimes,somewhat,somewhere,soon,sorry,specifically,specified,specify,specifying,still,stop,strongly,sub,
substantially,successfully,such,sufficiently,suggest,sup,sure,t,take,taken,taking,tell,tends,th,than,thank,thanks,thanx,
that,thatll,thats,thatve,the,their,theirs,them,themselves,then,thence,there,thereafter,thereby,thered,therefore,therein,
therell,thereof,therere,theres,thereto,thereupon,thereve,these,they,theyd,theyll,theyre,theyve,think,this,those,thou,
though,thoughh,thousand,throug,through,throughout,thru,thus,til,tip,to,together,too,took,toward,towards,tried,tries,truly,
try,trying,ts,twice,two,u,un,under,unfortunately,unless,unlike,unlikely,until,unto,up,upon,ups,us,use,used,useful,
usefully,usefulness,uses,using,usually,v,value,various,ve,very,via,viz,vol,vols,vs,w,want,wants,was,wasnt,way,we,wed,
welcome,well,went,were,werent,weve,what,whatever,whatll,whats,when,whence,whenever,where,whereafter,whereas,whereby,
wherein,wheres,whereupon,wherever,whether,which,while,whim,whither,who,whod,whoever,whole,wholl,whom,whomever,whos,whose,
why,widely,willing,wish,with,within,without,wont,words,world,would,wouldve,wouldnt,www,x,y,yes,yet,you,youd,youll,your,
youre,yours,yourself,yourselves,youve,z,zero"""
# Build the set of stop words and merge in COMMON_WORDS.
# Each token is stripped because the literal above wraps after commas: without
# strip() the first word of every continuation line would keep a leading
# newline (e.g. "\naffecting") and would never match a real word.
COMMON_WORDS2 = set(
    token.strip()
    for token in COMMON_WORDS2.replace("'", '').split(',')
    if token.strip()
).union(COMMON_WORDS)
# Count how often each non-stop word occurs in 'raw.txt' and write the words
# seen at least 10 times to 'all.txt', most frequent first.

# read the scraped text; 'with' closes the file even if reading fails
with open('raw.txt', 'r') as f:
    text = f.read()
# analyze text for frequency of word occurrence
d = defaultdict(int)
# split() with no argument breaks on any run of whitespace and never yields
# empty strings, so tabs or line breaks left in the text cannot stay glued
# to a word and no explicit empty-string check is needed
for word in text.split():
    word = word.lower()
    if word not in COMMON_WORDS:
        d[word] += 1
# sort by frequency and organize; items() exists on both Python 2 and 3,
# unlike iteritems()
freq = sorted(d.items(), key=operator.itemgetter(1), reverse=True)
# write to output file, one "word: count" line per word with count >= 10
with open('all.txt', 'w') as f:
    for item in freq:
        if item[1] >= 10:
            f.write(str(item[0]) + ': ' + str(item[1]) + '\n')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment