Created
April 25, 2018 16:16
-
-
Save pmichel31415/c475c0022dd0ebd5070b27a317bd64d3 to your computer and use it in GitHub Desktop.
Stats on ACL 2018 accepted papers
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Stats on ACL 2018 accepted papers.
#
# Downloads the accepted-papers page, then prints to stdout, in order:
#   1. author frequencies       ("<#authors> <#papers>" histogram)
#   2. first-author frequencies (same format, first authors only)
#   3. the 15 most frequent non-stop-words in paper titles
#
# Files written in the current directory:
#   index.html   - the downloaded page, reduced to the paper entries
#   authors.txt  - one lower-cased, comma-separated author list per paper
#   titles.txt   - one paper title per line
#
# Errors are checked explicitly (no `set -e`); the script's exit status is
# the status of main.

set -uo pipefail

# Byte-wise, locale-independent behaviour for sort/sed/tr/awk.
export LC_ALL=C

readonly ACL_URL='http://acl2018.org/conference/accepted-papers/index.html'

# English stop words (from NLTK), plus the fragments "s", "t" and "don".
readonly -a STOPWORDS=(
  i me my myself we our ours ourselves you your yours yourself yourselves
  he him his himself she her hers herself it its itself
  they them their theirs themselves what which who whom
  this that these those am is are was were be been being
  have has had having do does did doing
  a an the and but if or because as until while
  of at by for with about against between into through during
  before after above below to from up down in out on off over under
  again further then once here there when where why how
  all any both each few more most other some such
  no nor not only own same so than too very
  s t can will just don should now
)

# Prints its arguments as one line on stderr.
err() {
  printf '%s\n' "$*" >&2
}

# stdin:  HTML lines of the accepted-papers list.
# stdout: for every line containing a "paper-authors" span, the lower-cased
#         author list with ", ", " and " and ", and " normalised to ",".
#         Lines without such a span are dropped.
extract_authors() {
  sed -nE 's:.*<span class="paper-authors">([^<]*)</span>.*:\1:p' \
    | sed -E 's:,? and |, :,:g' \
    | tr '[:upper:]' '[:lower:]'
}

# stdin:  HTML lines of the accepted-papers list.
# stdout: the text of the "paper-title" span of every line that has one.
extract_titles() {
  sed -nE 's:.*<span class="paper-title">([^<]*)</span>.*:\1:p'
}

# stdin:  one item per line.
# stdout: `uniq -c` style histogram: how many distinct items occur k times,
#         one "<#items> <k>" line per k, ascending in k.
count_histogram() {
  # awk takes the count as the first whitespace-separated field, so this
  # does not depend on the column padding `uniq -c` uses.
  sort | uniq -c | awk '{ print $1 }' | sort -n | uniq -c
}

# stdin:  author lists as produced by extract_authors.
# stdout: histogram of papers per author (see count_histogram).
author_frequencies() {
  # Split the lists, trim surrounding blanks, and drop empty names.
  tr ',' '\n' \
    | sed -E -e 's/^[[:space:]]+//' -e 's/[[:space:]]+$//' -e '/^$/d' \
    | count_histogram
}

# stdin:  author lists as produced by extract_authors.
# stdout: histogram of papers per first author (see count_histogram).
first_author_frequencies() {
  cut -d',' -f1 | count_histogram
}

# stdin:  paper titles, one per line.
# $1:     how many words to print (default 15).
# stdout: the $1 most frequent title words that are not stop words, as
#         `uniq -c` lines ("<count> <word>"), least frequent first.
top_title_words() {
  local limit=${1:-15}

  # Lower-case, strip everything except [a-z0-9 ], then let awk split on
  # blanks (so runs of spaces yield no empty tokens) and drop stop words.
  tr '[:upper:]' '[:lower:]' \
    | sed 's/[^a-z0-9 ]//g' \
    | awk -v stop="${STOPWORDS[*]}" '
        BEGIN {
          n = split(stop, words, " ")
          for (i = 1; i <= n; i++) skip[words[i]] = 1
        }
        {
          for (i = 1; i <= NF; i++)
            if (!($i in skip)) print $i
        }' \
    | sort | uniq -c | sort -n | tail -n "$limit"
}

# Downloads the page, writes index.html / authors.txt / titles.txt and
# prints the three statistics. Returns non-zero if any step fails.
main() {
  if ! command -v wget >/dev/null; then
    err "error: wget is required but was not found"
    return 1
  fi

  # Get the data. -O makes a re-run overwrite index.html instead of saving
  # the fresh copy as index.html.1 and silently reusing the stale file.
  if ! wget -nv --tries=2 --timeout=10 -O index.html "$ACL_URL"; then
    err "error: could not download $ACL_URL"
    return 1
  fi

  # Keep the list of papers only
  if ! grep -F 'paper-title' index.html > index.html.tmp; then
    rm -f -- index.html.tmp
    err "error: no paper entries found in index.html"
    return 1
  fi
  mv -- index.html.tmp index.html || return

  # Extract author lists to csv
  extract_authors < index.html > authors.txt || return

  # Author frequencies
  author_frequencies < authors.txt || return

  # First author frequencies
  first_author_frequencies < authors.txt || return

  # Extract titles
  extract_titles < index.html > titles.txt || return

  # Word frequencies (stopwords are from NLTK)
  top_title_words 15 < titles.txt

  # I used Excel for the plots
}

main "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment