Last active
February 18, 2020 17:51
-
-
Save leeper/4cef7695cd2a7f2107c7 to your computer and use it in GitHub Desktop.
Quick analysis of Google's DMCA Data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Google DMCA Transparency Data
# 2014-07-24
# https://www.google.com/transparencyreport/removals/copyright/data/
# Fetch the zipped removal-request data and extract the requests table
zip_url <- 'http://transparencyreport.storage.googleapis.com/google-websearch-copyright-removals.zip'
zip_file <- 'localcopy.zip'
data_dir <- 'google-websearch-copyright-removals'
download.file(zip_url, zip_file, method = 'curl')
# Only requests.csv is pulled out of the archive
unzip(zip_file, files = file.path(data_dir, 'requests.csv'))
# Work from inside the extracted folder: the CSV is read from, and the PNGs
# are written to, the current working directory
setwd(file.path('.', data_dir))
# Codebook: field -- description -- Yes/No
# Request ID -- An ID unique to each copyright removal request -- Yes
# Date -- The date and time (in UTC) that the request was received, in ISO 8601 format -- Yes
# Chilling Effects URL -- URL to the Chilling Effects page documenting the request -- No
# Copyright owner ID -- The ID number of a unique copyright owner -- Yes
# Copyright owner name -- The name of the copyright owner associated with the request -- No
# Reporting organization ID -- The ID number of a unique reporting organization -- Yes
# Reporting organization name -- The name of the reporting organization associated with the request -- No
# URLs removed -- The number of URLs removed. -- Yes
# URLs for which we took no action -- The number of URLs for which we took no action. -- Yes
# URLs pending review -- The number of URLs that are still pending review. -- Yes
# From Abuser -- If the request was submitted by someone we believe to be abusing the process -- Yes
# Short column names paired with the type each column is read as.
# Order matters: the names are applied to the CSV columns by position.
classes <- c(
  # the request itself
  requestid = 'integer', date = 'character', url = 'character',
  # copyright owner
  ownerid = 'integer', ownername = 'character',
  # reporting organization
  orgid = 'integer', orgname = 'character',
  # URL counts by outcome
  urls_removed = 'integer', urls_noaction = 'integer', urls_pending = 'integer',
  # abuse flag
  abuser = 'logical'
)
# Load the requests table from the working directory. The file's own header
# row is replaced by the short names in `classes`, and each column is read
# with the type declared there.
d <- read.csv(
  file = 'requests.csv',
  col.names = names(classes),
  colClasses = classes,
  stringsAsFactors = FALSE
)
# Number of top reporting organizations to chart
n <- 40

# Chart 1: number of DMCA requests filed per reporting organization
requests_per_org <- table(d$orgname)
tab <- head(sort(requests_per_org, decreasing = TRUE), n)

# All bars are black except The Publishers Association, drawn in red
cols <- replace(
  rep('black', n),
  which(names(tab) == 'The Publishers Association'),
  'red'
)

png(filename = 'barplot.png', width = 800, height = 500)
# Wide left margin leaves room for the horizontal organization labels
par(mar = c(3, 15, 1, 1))
barplot(
  tab,
  horiz = TRUE,
  las = 1,
  space = 0,
  border = 'white',
  col = cols,
  cex.axis = 1,
  cex.names = .75,
  main = 'DMCA Requests to Google'
)
dev.off()
# Chart 2: total URLs removed per reporting organization (instead of counting
# requests)
agg <- aggregate(urls_removed ~ orgname, data = d, FUN = sum)
agg <- agg[order(agg$urls_removed, decreasing = TRUE), ]

# Keep the n largest totals as a vector named by organization
top_orgs <- head(agg, n)
tab2 <- setNames(top_orgs$urls_removed, top_orgs$orgname)

# All bars are black except The Publishers Association, drawn in red
cols2 <- replace(
  rep('black', n),
  which(names(tab2) == 'The Publishers Association'),
  'red'
)

png(filename = 'barplot2.png', width = 800, height = 500)
# Wide left margin leaves room for the horizontal organization labels
par(mar = c(3, 15, 1, 1))
# NOTE(review): the title says "Requested" but the bars sum urls_removed
# only -- confirm this is the intended measure
barplot(
  tab2,
  horiz = TRUE,
  las = 1,
  space = 0,
  border = 'white',
  col = cols2,
  cex.axis = 1,
  cex.names = .75,
  main = 'DMCA URL Takedowns Requested from Google'
)
dev.off()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment