# for this file, clone http://github.com/agoldst/dfr-analysis
source("~/Developer/dfr-analysis/metadata.R")
library(plyr)
library(stringr)

# Read a single DfR wordcounts CSV into a named numeric vector:
# names are word types, values are their counts in that document.
wordcounts_v <- function(f) {
    frm <- scan(f, what=list(word=character(), weight=integer()),
                sep=",", skip=1, quiet=TRUE)
    result <- frm$weight
    names(result) <- frm$word
    result
}

# Distance between two wordcount vectors. Both vectors are first padded
# with zeroes so that they are indexed by the union of their vocabularies,
# then (optionally) normalized to relative frequencies. method is either
# "taxicab" (L1 distance) or "squareddiff" (squared Euclidean distance).
wordcounts_dist <- function(a_v, b_v, normalize=FALSE, method="taxicab") {
    # add zero entries for words present in one vector but not the other
    a_v[setdiff(names(b_v), names(a_v))] <- 0
    b_v[setdiff(names(a_v), names(b_v))] <- 0
    # put b_v in the same word order as a_v
    b_v <- b_v[names(a_v)]
    if (normalize) {
        a_v <- a_v / sum(a_v)
        b_v <- b_v / sum(b_v)
    }
    switch(method,
           squareddiff=sum((a_v - b_v) * (a_v - b_v)),
           taxicab=sum(abs(a_v - b_v)))
}
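
# A minimal illustration of wordcounts_dist (not part of the original
# script; the toy vectors are invented for the example). The vocabularies
# are merged with zero padding, so the taxicab distance below is
# |3 - 1| + |2 - 0| + |0 - 4| = 8.
# wordcounts_dist(c(the=3, woman=2), c(the=1, history=4))              # 8
# wordcounts_dist(c(the=3, woman=2), c(the=1, history=4),
#                 normalize=TRUE, method="squareddiff")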

# Check whether two citations.CSV files describe the same documents:
# read both, sort by id, and compare them cell by cell.
check_meta <- function(f_a, f_b) {
    a <- read_metadata(f_a)
    b <- read_metadata(f_b)
    a <- a[order(a$id), ]
    b <- b[order(b$id), ]
    all(a == b)
}

# Spot-check a PDF: for each word, count whole-word, case-insensitive
# matches with pdfgrep. Returns a named numeric vector of counts
# (NA if the pdfgrep call fails).
pdfgrep_check <- function(f, words) {
    result <- numeric(length(words))
    names(result) <- words
    for (w in words) {
        result[w] <- as.numeric(try(
            system(str_c("pdfgrep -c -i '[[:<:]]", w, "[[:>:]]' ", f),
                   intern=TRUE)))
    }
    result
}
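
# Hypothetical spot check (requires pdfgrep on the PATH; the file name
# and word list here are made up): count a few terms in a downloaded PDF
# so the results can be compared by hand against its wordcounts vector.
# pdfgrep_check("some_article.pdf", c("gender", "feminist", "labor"))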

# Compare two DfR downloads of the same corpus. Checks that the metadata
# match, then computes, for each document, the taxicab distance between
# its "before" and "after" wordcount vectors, together with the total
# word count on each side. Returns the metadata frame with dist,
# length_before, and length_after columns added.
data_quality <- function(
        before_path="/Users/agoldst/Documents/signs-model/data/20130919",
        after_path="/Users/agoldst/Documents/signs-model/data/20131105",
        LOG_INTERVAL=500) {
    result <- read_metadata(file.path(before_path, "citations.CSV"))
    message(ifelse(check_meta(file.path(before_path, "citations.CSV"),
                              file.path(after_path, "citations.CSV")),
                   "all metadata match",
                   "metadata do not match"))
    distance <- numeric(nrow(result))
    length_before <- numeric(nrow(result))
    length_after <- numeric(nrow(result))
    for (i in seq_along(result$id)) {
        if (i %% LOG_INTERVAL == 0) {
            message("Processed ", i, " file pairs")
        }
        id <- result$id[i]
        v_before <- wordcounts_v(file.path(before_path, "wordcounts",
                                           as.filename(id)))
        v_after <- wordcounts_v(file.path(after_path, "wordcounts",
                                          as.filename(id)))
        distance[i] <- wordcounts_dist(v_before, v_after,
                                       normalize=FALSE, method="taxicab")
        length_before[i] <- sum(v_before)
        length_after[i] <- sum(v_after)
    }
    result$dist <- distance
    result$length_before <- length_before
    result$length_after <- length_after
    result
}
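
# One possible way to use the result (a sketch, not in the original gist):
# run the comparison, then look at the documents whose wordcounts changed
# most between the two downloads, relative to their length.
# dq <- data_quality()   # or pass explicit before_path / after_path
# dq <- dq[order(dq$dist / pmax(dq$length_before, 1), decreasing=TRUE), ]
# head(dq[, c("id", "dist", "length_before", "length_after")])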