Skip to content

Instantly share code, notes, and snippets.

@allisonmorgan
Last active October 23, 2018 22:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save allisonmorgan/aa29935affdf2b2392c9da56d74f96a6 to your computer and use it in GitHub Desktop.
Save allisonmorgan/aa29935affdf2b2392c9da56d74f96a6 to your computer and use it in GitHub Desktop.
Counting punctuation in DBLP titles
1959 2 80
1960 0 9
1961 0 26
1962 7 147
1963 1 13
1964 4 42
1965 4 75
1966 20 157
1967 37 279
1968 65 549
1969 58 456
1970 43 306
1971 73 631
1972 71 631
1973 106 1007
1974 177 1359
1975 171 1311
1976 272 1639
1977 234 1674
1978 257 1932
1979 319 2156
1980 417 2496
1981 527 2870
1982 580 3223
1983 645 3595
1984 865 4531
1985 806 4490
1986 1176 6139
1987 1055 6523
1988 1402 8250
1989 1653 9820
1990 1860 11719
1991 2272 12637
1992 2404 14395
1993 3075 18354
1994 3843 21959
1995 3853 21587
1996 4009 23786
1997 4899 26848
1998 5535 31607
1999 6668 35135
2000 7815 39933
2001 8589 42756
2002 10529 49407
2003 12651 60270
2004 16443 76917
2005 18539 88419
2006 21628 97463
2007 22791 106399
2008 24231 110326
2009 24679 116249
2010 25779 121596
2011 28523 132934
2012 30868 139126
2013 30765 142647
2014 31439 144993
2015 32187 147075
2016 32574 147512
2017 32049 145898
2018 16562 79072
1959 0 80
1960 0 9
1961 0 26
1962 2 147
1963 1 13
1964 3 42
1965 3 75
1966 15 157
1967 35 279
1968 61 549
1969 64 456
1970 47 306
1971 81 631
1972 103 631
1973 95 1007
1974 125 1359
1975 128 1311
1976 154 1639
1977 185 1674
1978 212 1932
1979 183 2156
1980 274 2496
1981 349 2870
1982 380 3223
1983 443 3595
1984 527 4531
1985 619 4490
1986 870 6139
1987 909 6523
1988 1223 8250
1989 1356 9820
1990 1650 11719
1991 1769 12637
1992 2119 14395
1993 2733 18354
1994 3147 21959
1995 3190 21587
1996 3473 23786
1997 4233 26848
1998 4764 31607
1999 5271 35135
2000 5914 39933
2001 6400 42756
2002 7546 49407
2003 9149 60270
2004 11205 76917
2005 12707 88419
2006 13530 97463
2007 15103 106399
2008 15954 110326
2009 17106 116249
2010 18368 121596
2011 20879 132934
2012 22924 139126
2013 24604 142647
2014 26098 144993
2015 27197 147075
2016 28741 147512
2017 29275 145898
2018 18444 79072
import plot_utils
import csv
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
def get_wilson_score_interval(num_successes, num_failures, alpha=0.05):
    """Return the Wilson score confidence interval for a success probability.

    Parameters:
        num_successes: number of observed successes.
        num_failures: number of observed failures.
        alpha: significance level; the interval has nominal coverage
            1 - alpha (default 0.05, i.e. a 95% interval).

    Returns:
        A (lower, upper) tuple bounding the success probability p.
        With zero observations the result is (0, 1), the limit of the
        formula as the number of trials goes to zero.
    """
    # Two-sided critical value of the standard normal distribution.
    z = stats.norm.ppf(q=1 - alpha / 2.)
    ns = num_successes
    nf = num_failures
    n = ns + nf

    # ns*nf/n is 0/0 when there are no trials; treat it as 0 instead of
    # raising ZeroDivisionError. float() keeps the division exact on
    # interpreters where int / int floors.
    spread = float(ns * nf) / n if n > 0 else 0.0

    # Both bounds share the same centre and half-width, so compute them once.
    denom = n + z**2
    center = (ns + .5 * z**2) / denom
    margin = z / denom * np.sqrt(spread + .25 * z**2)
    return (center - margin, center + margin)
# Plot, per punctuation type, the yearly fraction of titles that contain it,
# with a shaded Wilson score confidence band around each series.
fig, ax = plt.subplots(figsize=(12, 6))

# One tab-separated input file per series: year, matching titles, all titles.
files = ['colon_count.txt', 'hyphen_count.txt', 'question_count.txt', 'acronym_count.txt', "quote_count.txt"] #'acronym_colon_count.txt']
labels = ['colon', 'hyphen', 'question', 'acronym', 'quote']
palette = ['#539CAF', '#49586E', '#8A696E', '#B06B5D', '#BAA657'] #'#748BD1',

for path, label, color in zip(files, labels, palette):
    with open(path, 'r') as tsv:
        rows = [line.strip().split('\t') for line in tsv]

    years = []
    fractions = []
    band_low = []
    band_high = []
    for row in rows:
        year = int(row[0])
        # Only years from 1975 onwards are plotted.
        if year < 1975:
            continue
        matched = int(row[1])
        unmatched = int(row[2]) - matched
        low, high = get_wilson_score_interval(matched, unmatched)

        years.append(year)
        fractions.append(float(row[1]) / float(row[2]))
        band_low.append(low)
        band_high.append(high)

    ax.scatter(years, fractions, color=color, label=label)
    ax.fill_between(years, band_low, band_high, color=color, alpha=0.4)

# Axis decoration and export.
ax.set_xticks(range(1975, 2020, 5))
ax.legend(fontsize=plot_utils.LEGEND_SIZE, frameon=False, ncol=2)
ax.set_ylabel('Frequency')
ax.set_xlabel('Year')
ax.set_title('Relative Counts of Punctuation (Source: DBLP 1975--Present)')
plot_utils.finalize(ax)

plt.tight_layout()
plt.savefig('frequency.pdf', bbox_inches='tight', format='pdf', dpi=1000)
plt.clf()
1959 1 80
1960 0 9
1961 0 26
1962 7 147
1963 0 13
1964 0 42
1965 1 75
1966 1 157
1967 3 279
1968 12 549
1969 2 456
1970 6 306
1971 10 631
1972 18 631
1973 28 1007
1974 56 1359
1975 45 1311
1976 60 1639
1977 68 1674
1978 78 1932
1979 73 2156
1980 108 2496
1981 104 2870
1982 108 3223
1983 153 3595
1984 183 4531
1985 138 4490
1986 258 6139
1987 211 6523
1988 256 8250
1989 268 9820
1990 287 11719
1991 325 12637
1992 367 14395
1993 502 18354
1994 546 21959
1995 423 21587
1996 486 23786
1997 547 26848
1998 736 31607
1999 883 35135
2000 842 39933
2001 1003 42756
2002 1074 49407
2003 1273 60270
2004 1693 76917
2005 1821 88419
2006 2152 97463
2007 2242 106399
2008 2560 110326
2009 2676 116249
2010 2942 121596
2011 3514 132934
2012 3375 139126
2013 3462 142647
2014 3565 144993
2015 3450 147075
2016 2950 147512
2017 2846 145898
2018 1639 79072
package main
import (
"compress/gzip"
"encoding/xml"
"fmt"
"golang.org/x/net/html/charset"
"log"
"os"
"strings"
//"regexp"
"sort"
)
// InProceedings is the top-level target the DBLP XML data is unmarshalled
// into. Only child elements named "inproceedings" are unpacked (into
// Papers); this struct maps no other element name.
type InProceedings struct {
Papers []Paper `xml:"inproceedings"`
}
// Paper holds the common features of a publication record. It is not a
// complete set of the attributes present in the source XML.
//
// Key is read from the element's "key" attribute; every other field is read
// from the child element named in its struct tag. Authors is a slice, so it
// receives one entry per "author" child. Year is decoded as an integer.
// NOTE(review): EE presumably holds an electronic-edition link and Crossref
// the key of the enclosing proceedings — confirm against the DBLP DTD.
type Paper struct {
Key string `xml:"key,attr"`
Authors []string `xml:"author"`
Title string `xml:"title"`
Year int `xml:"year"`
Booktitle string `xml:"booktitle"`
EE string `xml:"ee"`
Crossref string `xml:"crossref"`
Url string `xml:"url"`
}
// decoder.Decode() errors on special (foreign language) characters written
// as named entities, so each such entity is replaced by a plain stand-in.
// Entity names are formed by pairing every base letter in characters with
// every modifier in suffixes (for example "a" + "uml" -> "auml").
var characters = []string{"a", "e", "o", "u", "i", "c", "b", "s", "y", "n", "ae"}
var suffixes = []string{"acute", "uml", "ring", "zlig", "slash", "cedil", "grave", "circ", "tilde", "lig"}

// build_special_character_list returns the entity-name -> replacement table
// used by the XML decoder. Both the lower-case and the upper-case spelling
// of each letter+suffix entity map to the lower-case base letter(s); a few
// entities that do not follow that pattern are added explicitly.
func build_special_character_list() map[string]string {
	table := map[string]string{}

	for ci := 0; ci < len(characters); ci++ {
		base := characters[ci]
		upperBase := strings.ToUpper(base)
		for si := 0; si < len(suffixes); si++ {
			table[base+suffixes[si]] = base
			table[upperBase+suffixes[si]] = base
		}
	}

	// Other random special characters
	extras := [][2]string{
		{"times", "*"},
		{"reg", "reg"},
		{"eth", "d"},
		{"ETH", "d"},
		{"micro", "u"},
		{"thorn", "p"},
		{"THORN", "p"},
	}
	for _, pair := range extras {
		table[pair[0]] = pair[1]
	}

	return table
}
// main tallies, per publication year, how many decoded "inproceedings"
// titles contain a double quote and how many titles there are in total,
// then writes one tab-separated line per year ("year\tmatches\ttotal"),
// in ascending year order, to quote_count.txt.
//
// Any I/O or decoding failure is fatal: it is logged and the process exits.
func main() {
	// Download data from http://dblp.uni-trier.de/xml/,
	// change the file path below to point to it
	log.Println("Reading in DBLP gzipped file")
	in, err := os.Open("dblp.xml.gz")
	if err != nil {
		log.Fatal(err)
	}
	defer in.Close()

	// Golang can read straight from gzipped files!
	gzf, err := gzip.NewReader(in)
	if err != nil {
		log.Fatal(err)
	}
	// Release the decompressor once decoding is finished.
	defer gzf.Close()

	// Start a new XML decoder instance and ask it to
	// find the encoding specified in the file's header.
	// Specify the special character replacements.
	decoder := xml.NewDecoder(gzf)
	decoder.CharsetReader = charset.NewReaderLabel
	decoder.Entity = build_special_character_list()

	// This step takes a while (~2 minutes)
	log.Println("Decoding all proceedings from file")
	var papers InProceedings
	if err := decoder.Decode(&papers); err != nil {
		log.Fatal(err)
	}
	log.Println("Done decoding")
	// log.Printf("Example paper: %+v\n", papers.Papers[0])

	// counter[year] = titles matching the pattern; total[year] = all titles.
	counter := make(map[int]int)
	total := make(map[int]int)
	for _, paper := range papers.Papers {
		// Alternative patterns; swap one in (and change the output file
		// name below) to count a different feature. The regexp variant
		// also needs the "regexp" import enabled.
		// matched, _ := regexp.MatchString(".*[A-Z]{3}.*", paper.Title)
		// matched := strings.Contains(paper.Title, ":")
		// matched := strings.Contains(paper.Title, " - ")
		// matched := strings.Contains(paper.Title, "?")
		if strings.Contains(paper.Title, "\"") {
			counter[paper.Year]++
		}
		total[paper.Year]++
	}
	log.Println(counter)
	log.Println(total)

	// Map iteration order is random, so sort the years to get a
	// deterministic, chronologically ordered output file.
	years := make([]int, 0, len(total))
	for year := range total {
		years = append(years, year)
	}
	sort.Ints(years)
	log.Println(years)

	// Write the per-year counts to a new text file
	out, err := os.Create("quote_count.txt")
	if err != nil {
		log.Fatal(err)
	}
	for _, year := range years {
		if _, err := fmt.Fprintf(out, "%d\t%d\t%d\n", year, counter[year], total[year]); err != nil {
			out.Close()
			log.Fatal(err)
		}
	}
	// Close explicitly and check the error: a failure here can mean the
	// data never reached disk, which a deferred Close would hide.
	if err := out.Close(); err != nil {
		log.Fatal(err)
	}
	log.Println("Output data to quote_count.txt")
}
1959 1 80
1960 0 9
1961 0 26
1962 0 147
1963 0 13
1964 0 42
1965 0 75
1966 0 157
1967 3 279
1968 2 549
1969 0 456
1970 1 306
1971 6 631
1972 4 631
1973 7 1007
1974 9 1359
1975 15 1311
1976 11 1639
1977 16 1674
1978 25 1932
1979 17 2156
1980 36 2496
1981 30 2870
1982 29 3223
1983 41 3595
1984 56 4531
1985 63 4490
1986 85 6139
1987 82 6523
1988 107 8250
1989 140 9820
1990 130 11719
1991 150 12637
1992 184 14395
1993 207 18354
1994 296 21959
1995 263 21587
1996 286 23786
1997 331 26848
1998 377 31607
1999 431 35135
2000 472 39933
2001 494 42756
2002 645 49407
2003 709 60270
2004 839 76917
2005 910 88419
2006 936 97463
2007 1050 106399
2008 1223 110326
2009 1327 116249
2010 1330 121596
2011 1587 132934
2012 1745 139126
2013 1974 142647
2014 2032 144993
2015 2166 147075
2016 2223 147512
2017 2340 145898
2018 1363 79072
1959 0 80
1960 0 9
1961 0 26
1962 1 147
1963 0 13
1964 1 42
1965 2 75
1966 1 157
1967 5 279
1968 2 549
1969 2 456
1970 2 306
1971 7 631
1972 12 631
1973 7 1007
1974 11 1359
1975 17 1311
1976 6 1639
1977 14 1674
1978 29 1932
1979 14 2156
1980 32 2496
1981 26 2870
1982 36 3223
1983 22 3595
1984 25 4531
1985 43 4490
1986 46 6139
1987 47 6523
1988 46 8250
1989 66 9820
1990 58 11719
1991 87 12637
1992 91 14395
1993 95 18354
1994 132 21959
1995 113 21587
1996 115 23786
1997 150 26848
1998 162 31607
1999 228 35135
2000 164 39933
2001 209 42756
2002 237 49407
2003 266 60270
2004 351 76917
2005 357 88419
2006 362 97463
2007 418 106399
2008 482 110326
2009 430 116249
2010 471 121596
2011 546 132934
2012 552 139126
2013 607 142647
2014 602 144993
2015 642 147075
2016 672 147512
2017 593 145898
2018 343 79072
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment