Last active
October 23, 2018 22:15
-
-
Save allisonmorgan/aa29935affdf2b2392c9da56d74f96a6 to your computer and use it in GitHub Desktop.
Counting punctuation in DBLP titles
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
1959 2 80 | |
1960 0 9 | |
1961 0 26 | |
1962 7 147 | |
1963 1 13 | |
1964 4 42 | |
1965 4 75 | |
1966 20 157 | |
1967 37 279 | |
1968 65 549 | |
1969 58 456 | |
1970 43 306 | |
1971 73 631 | |
1972 71 631 | |
1973 106 1007 | |
1974 177 1359 | |
1975 171 1311 | |
1976 272 1639 | |
1977 234 1674 | |
1978 257 1932 | |
1979 319 2156 | |
1980 417 2496 | |
1981 527 2870 | |
1982 580 3223 | |
1983 645 3595 | |
1984 865 4531 | |
1985 806 4490 | |
1986 1176 6139 | |
1987 1055 6523 | |
1988 1402 8250 | |
1989 1653 9820 | |
1990 1860 11719 | |
1991 2272 12637 | |
1992 2404 14395 | |
1993 3075 18354 | |
1994 3843 21959 | |
1995 3853 21587 | |
1996 4009 23786 | |
1997 4899 26848 | |
1998 5535 31607 | |
1999 6668 35135 | |
2000 7815 39933 | |
2001 8589 42756 | |
2002 10529 49407 | |
2003 12651 60270 | |
2004 16443 76917 | |
2005 18539 88419 | |
2006 21628 97463 | |
2007 22791 106399 | |
2008 24231 110326 | |
2009 24679 116249 | |
2010 25779 121596 | |
2011 28523 132934 | |
2012 30868 139126 | |
2013 30765 142647 | |
2014 31439 144993 | |
2015 32187 147075 | |
2016 32574 147512 | |
2017 32049 145898 | |
2018 16562 79072 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
1959 0 80 | |
1960 0 9 | |
1961 0 26 | |
1962 2 147 | |
1963 1 13 | |
1964 3 42 | |
1965 3 75 | |
1966 15 157 | |
1967 35 279 | |
1968 61 549 | |
1969 64 456 | |
1970 47 306 | |
1971 81 631 | |
1972 103 631 | |
1973 95 1007 | |
1974 125 1359 | |
1975 128 1311 | |
1976 154 1639 | |
1977 185 1674 | |
1978 212 1932 | |
1979 183 2156 | |
1980 274 2496 | |
1981 349 2870 | |
1982 380 3223 | |
1983 443 3595 | |
1984 527 4531 | |
1985 619 4490 | |
1986 870 6139 | |
1987 909 6523 | |
1988 1223 8250 | |
1989 1356 9820 | |
1990 1650 11719 | |
1991 1769 12637 | |
1992 2119 14395 | |
1993 2733 18354 | |
1994 3147 21959 | |
1995 3190 21587 | |
1996 3473 23786 | |
1997 4233 26848 | |
1998 4764 31607 | |
1999 5271 35135 | |
2000 5914 39933 | |
2001 6400 42756 | |
2002 7546 49407 | |
2003 9149 60270 | |
2004 11205 76917 | |
2005 12707 88419 | |
2006 13530 97463 | |
2007 15103 106399 | |
2008 15954 110326 | |
2009 17106 116249 | |
2010 18368 121596 | |
2011 20879 132934 | |
2012 22924 139126 | |
2013 24604 142647 | |
2014 26098 144993 | |
2015 27197 147075 | |
2016 28741 147512 | |
2017 29275 145898 | |
2018 18444 79072 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import plot_utils | |
import csv | |
import matplotlib.pyplot as plt | |
import numpy as np | |
from scipy import stats | |
def get_wilson_score_interval(num_successes, num_failures, alpha=0.05):
    """Return the Wilson score confidence interval for a success probability.

    Args:
        num_successes: Number of observed successes.
        num_failures: Number of observed failures.
        alpha: Significance level; the interval has confidence ``1 - alpha``.

    Returns:
        A ``(lower, upper)`` tuple bounding p, the success probability.
        When there are no observations at all, ``(0.0, 1.0)`` is returned.
    """
    n = num_successes + num_failures
    if n == 0:
        # The formula below divides by n. With n = 0 the interval degenerates
        # to centre 0.5 and half-width 0.5, i.e. the whole unit interval, so
        # return that instead of raising ZeroDivisionError.
        return (0.0, 1.0)

    z = stats.norm.ppf(q=1 - alpha / 2.)
    z_squared = z ** 2
    denominator = n + z_squared

    # The interval is symmetric about `center`; compute the shared pieces once.
    center = (num_successes + .5 * z_squared) / denominator
    # float(n) keeps this a true division even for integer counts.
    spread = np.sqrt((num_successes * num_failures) / float(n) + .25 * z_squared)
    half_width = z / denominator * spread
    return (center - half_width, center + half_width)
# Plot, per year, the fraction of titles containing each punctuation mark,
# with a shaded Wilson score confidence band around every series.
fig, ax = plt.subplots(figsize=(12, 6))

# Parallel lists: count file, legend label and colour of each series.
files = ['colon_count.txt', 'hyphen_count.txt', 'question_count.txt', 'acronym_count.txt', "quote_count.txt"]
labels = ['colon', 'hyphen', 'question', 'acronym', 'quote']
colors = ['#539CAF', '#49586E', '#8A696E', '#B06B5D', '#BAA657']

for filename, label, color in zip(files, labels, colors):
    x = []
    y = []
    lower_CI = []
    upper_CI = []
    # Each line holds three tab-separated integers: year, number of matching
    # titles, total number of titles.
    with open(filename, 'r') as tsv:
        for line in tsv:
            row = line.strip().split('\t')
            if len(row) < 3:
                # Blank or truncated line (e.g. a stray newline at EOF).
                continue
            year, matched, total = int(row[0]), int(row[1]), int(row[2])
            if year < 1975 or total == 0:
                # Before the plotted range, or no titles to take a fraction of.
                continue
            x.append(year)
            y.append(float(matched) / total)
            low, high = get_wilson_score_interval(matched, total - matched)
            lower_CI.append(low)
            upper_CI.append(high)
    ax.scatter(x, y, color=color, label=label)
    ax.fill_between(x, lower_CI, upper_CI, color=color, alpha=0.4)

ax.set_xticks(range(1975, 2020, 5))
ax.legend(fontsize=plot_utils.LEGEND_SIZE, frameon=False, ncol=2)
ax.set_ylabel('Frequency')
ax.set_xlabel('Year')
ax.set_title('Relative Counts of Punctuation (Source: DBLP 1975--Present)')
plot_utils.finalize(ax)
plt.tight_layout()
plt.savefig('frequency.pdf', bbox_inches='tight', format='pdf', dpi=1000)
plt.clf()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
1959 1 80 | |
1960 0 9 | |
1961 0 26 | |
1962 7 147 | |
1963 0 13 | |
1964 0 42 | |
1965 1 75 | |
1966 1 157 | |
1967 3 279 | |
1968 12 549 | |
1969 2 456 | |
1970 6 306 | |
1971 10 631 | |
1972 18 631 | |
1973 28 1007 | |
1974 56 1359 | |
1975 45 1311 | |
1976 60 1639 | |
1977 68 1674 | |
1978 78 1932 | |
1979 73 2156 | |
1980 108 2496 | |
1981 104 2870 | |
1982 108 3223 | |
1983 153 3595 | |
1984 183 4531 | |
1985 138 4490 | |
1986 258 6139 | |
1987 211 6523 | |
1988 256 8250 | |
1989 268 9820 | |
1990 287 11719 | |
1991 325 12637 | |
1992 367 14395 | |
1993 502 18354 | |
1994 546 21959 | |
1995 423 21587 | |
1996 486 23786 | |
1997 547 26848 | |
1998 736 31607 | |
1999 883 35135 | |
2000 842 39933 | |
2001 1003 42756 | |
2002 1074 49407 | |
2003 1273 60270 | |
2004 1693 76917 | |
2005 1821 88419 | |
2006 2152 97463 | |
2007 2242 106399 | |
2008 2560 110326 | |
2009 2676 116249 | |
2010 2942 121596 | |
2011 3514 132934 | |
2012 3375 139126 | |
2013 3462 142647 | |
2014 3565 144993 | |
2015 3450 147075 | |
2016 2950 147512 | |
2017 2846 145898 | |
2018 1639 79072 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"compress/gzip" | |
"encoding/xml" | |
"fmt" | |
"golang.org/x/net/html/charset" | |
"log" | |
"os" | |
"strings" | |
//"regexp" | |
"sort" | |
) | |
// Structs to unmarshal the DBLP XML data into | |
// Only unpacks "inproceedings" publications | |
type InProceedings struct { | |
Papers []Paper `xml:"inproceedings"` | |
} | |
// Paper holds the features common to publications. It is not a complete
// set of attributes. Key is read from the element's "key" attribute; every
// other field is read from the child element named in its tag.
type Paper struct {
	Key       string   `xml:"key,attr"`
	Authors   []string `xml:"author"`
	Title     string   `xml:"title"`
	Year      int      `xml:"year"`
	Booktitle string   `xml:"booktitle"`
	EE        string   `xml:"ee"`
	Crossref  string   `xml:"crossref"`
	Url       string   `xml:"url"`
}
// Since decoder.Decode() will error on special characters, the table built
// below converts these foreign language characters to plain replacements.
// Entity names are formed as <base letter><suffix>, e.g. "e"+"acute".
var characters = []string{"a", "e", "o", "u", "i", "c", "b", "s", "y", "n", "ae"}
var suffixes = []string{"acute", "uml", "ring", "zlig", "slash", "cedil", "grave", "circ", "tilde", "lig"}

// build_special_character_list returns a map from entity name to its
// replacement text, covering every base-letter/suffix combination in both
// lower and upper case plus a handful of one-off entities.
//
// Upper-case entities map to upper-case replacements ("Eacute" -> "E") so
// that capitalisation in titles survives the substitution.
func build_special_character_list() map[string]string {
	// Two entries per (character, suffix) pair, plus the one-offs below.
	special_characters := make(map[string]string, 2*len(characters)*len(suffixes)+7)

	for _, character := range characters {
		upper := strings.ToUpper(character)
		for _, suffix := range suffixes {
			special_characters[character+suffix] = character
			special_characters[upper+suffix] = upper
		}
	}

	// Other random special characters
	special_characters["times"] = "*"
	special_characters["reg"] = "reg"
	special_characters["eth"] = "d"
	special_characters["ETH"] = "D"
	special_characters["micro"] = "u"
	special_characters["thorn"] = "p"
	special_characters["THORN"] = "P"

	return special_characters
}
func main() { | |
// Download data from http://dblp.uni-trier.de/xml/, | |
// change the file path below to point to it | |
log.Println("Reading in DBLP gzipped file") | |
f, err := os.Open("dblp.xml.gz") | |
if err != nil { | |
log.Fatal(err) | |
} | |
defer f.Close() | |
// Golang can read straight from gzipped files! | |
gzf, err := gzip.NewReader(f) | |
if err != nil { | |
log.Fatal(err) | |
} | |
// Start a new XML decoder instance and ask it to | |
// find the encoding specified in the file's header. | |
// Specify the special character replacements. | |
decoder := xml.NewDecoder(gzf) | |
decoder.CharsetReader = charset.NewReaderLabel | |
decoder.Entity = build_special_character_list() | |
// This step takes a while (~2 minutes) | |
log.Println("Decoding all proceedings from file") | |
var papers InProceedings | |
err = decoder.Decode(&papers) | |
if err != nil { | |
log.Fatal(err) | |
} | |
log.Println("Done decoding") | |
// log.Printf("Example paper: %+v\n", papers.Papers[0]) | |
counter := make(map[int]int) | |
total := make(map[int]int) | |
for _, paper := range papers.Papers { | |
// matched, _ := regexp.MatchString(".*[A-Z]{3}.*", paper.Title) | |
// matched := strings.Contains(paper.Title, ":") | |
// matched := strings.Contains(paper.Title, " - ") | |
// matched := strings.Contains(paper.Title, "?") | |
matched := strings.Contains(paper.Title, "\"") | |
if matched { | |
counter[paper.Year] += 1 | |
} | |
total[paper.Year] += 1 | |
} | |
log.Println(counter) | |
log.Println(total) | |
// log.Printf("Number of %s titles: %v\tAverage title length: %v\n", strings.ToUpper(conference), len(titles), float64(avg_length)/float64(len(titles))) | |
// Write all the papers to a new text file | |
f, err = os.Create(fmt.Sprintf("quote_count.txt")) | |
if err != nil{ | |
log.Fatal(err) | |
} | |
defer f.Close() | |
var years []int | |
for k, _ := range total { | |
years = append(years, k) | |
} | |
sort.Ints(years) | |
log.Println(years) | |
for i := range years { | |
_, err := f.WriteString(fmt.Sprintf("%d\t%d\t%d\n", years[i], counter[years[i]], total[years[i]])) | |
if err != nil { | |
log.Fatal(err) | |
} | |
} | |
log.Println("Output data to quote_count.txt") | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
1959 1 80 | |
1960 0 9 | |
1961 0 26 | |
1962 0 147 | |
1963 0 13 | |
1964 0 42 | |
1965 0 75 | |
1966 0 157 | |
1967 3 279 | |
1968 2 549 | |
1969 0 456 | |
1970 1 306 | |
1971 6 631 | |
1972 4 631 | |
1973 7 1007 | |
1974 9 1359 | |
1975 15 1311 | |
1976 11 1639 | |
1977 16 1674 | |
1978 25 1932 | |
1979 17 2156 | |
1980 36 2496 | |
1981 30 2870 | |
1982 29 3223 | |
1983 41 3595 | |
1984 56 4531 | |
1985 63 4490 | |
1986 85 6139 | |
1987 82 6523 | |
1988 107 8250 | |
1989 140 9820 | |
1990 130 11719 | |
1991 150 12637 | |
1992 184 14395 | |
1993 207 18354 | |
1994 296 21959 | |
1995 263 21587 | |
1996 286 23786 | |
1997 331 26848 | |
1998 377 31607 | |
1999 431 35135 | |
2000 472 39933 | |
2001 494 42756 | |
2002 645 49407 | |
2003 709 60270 | |
2004 839 76917 | |
2005 910 88419 | |
2006 936 97463 | |
2007 1050 106399 | |
2008 1223 110326 | |
2009 1327 116249 | |
2010 1330 121596 | |
2011 1587 132934 | |
2012 1745 139126 | |
2013 1974 142647 | |
2014 2032 144993 | |
2015 2166 147075 | |
2016 2223 147512 | |
2017 2340 145898 | |
2018 1363 79072 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
1959 0 80 | |
1960 0 9 | |
1961 0 26 | |
1962 1 147 | |
1963 0 13 | |
1964 1 42 | |
1965 2 75 | |
1966 1 157 | |
1967 5 279 | |
1968 2 549 | |
1969 2 456 | |
1970 2 306 | |
1971 7 631 | |
1972 12 631 | |
1973 7 1007 | |
1974 11 1359 | |
1975 17 1311 | |
1976 6 1639 | |
1977 14 1674 | |
1978 29 1932 | |
1979 14 2156 | |
1980 32 2496 | |
1981 26 2870 | |
1982 36 3223 | |
1983 22 3595 | |
1984 25 4531 | |
1985 43 4490 | |
1986 46 6139 | |
1987 47 6523 | |
1988 46 8250 | |
1989 66 9820 | |
1990 58 11719 | |
1991 87 12637 | |
1992 91 14395 | |
1993 95 18354 | |
1994 132 21959 | |
1995 113 21587 | |
1996 115 23786 | |
1997 150 26848 | |
1998 162 31607 | |
1999 228 35135 | |
2000 164 39933 | |
2001 209 42756 | |
2002 237 49407 | |
2003 266 60270 | |
2004 351 76917 | |
2005 357 88419 | |
2006 362 97463 | |
2007 418 106399 | |
2008 482 110326 | |
2009 430 116249 | |
2010 471 121596 | |
2011 546 132934 | |
2012 552 139126 | |
2013 607 142647 | |
2014 602 144993 | |
2015 642 147075 | |
2016 672 147512 | |
2017 593 145898 | |
2018 343 79072 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment