# leeper/google_dmca.R

## google_dmca.R
# Google DMCA Transparency Data

# 2014-07-24

# https://www.google.com/transparencyreport/removals/copyright/data/

# Fetch the zipped data dump, extract only requests.csv from it, and make
# the extracted folder the working directory so that the relative paths
# used further down (the CSV and the PNG outputs) resolve inside it.
download.file(
  url = "http://transparencyreport.storage.googleapis.com/google-websearch-copyright-removals.zip",
  destfile = "localcopy.zip",
  method = "curl"
)
unzip(
  zipfile = "localcopy.zip",
  files = "google-websearch-copyright-removals/requests.csv"
)
setwd(dir = "./google-websearch-copyright-removals")

# Codebook
# Request ID	An ID unique to each copyright removal request	Yes
# Date	The date and time (in UTC) that the request was received, in ISO 8601 format	Yes
# Chilling Effects URL	URL to the Chilling Effects page documenting the request	No
# Copyright owner ID	The ID number of a unique copyright owner	Yes
# Copyright owner name	The name of the copyright owner associated with the request	No
# Reporting organization ID	The ID number of a unique reporting organization	Yes
# Reporting organization name	The name of the reporting organization associated with the request	No
# URLs removed	The number of URLs removed. Learn more.	Yes
# URLs for which we took no action	The number of URLs for which we took no action. Learn more.	Yes
# URLs pending review	The number of URLs that are still pending review. Learn more.	Yes
# From Abuser	If the request was submitted by someone we believe to be abusing the process	Yes

# Column schema for requests.csv, in the order the fields are listed in
# the codebook above: each name is the short label the script uses for a
# column, each value is the class that column is read as.
classes <- setNames(
  c("integer", "character", "character",
    "integer", "character",
    "integer", "character",
    "integer", "integer", "integer",
    "logical"),
  c("requestid", "date", "url",
    "ownerid", "ownername",
    "orgid", "orgname",
    "urls_removed", "urls_noaction", "urls_pending",
    "abuser")
)
# Load the request log. The column names are taken from `classes` (via
# col.names) and each column is read as the class declared for it there;
# text columns stay character rather than becoming factors.
d <- read.csv(
  file = "requests.csv",
  col.names = names(classes),
  colClasses = classes,
  stringsAsFactors = FALSE
)

# Both charts are limited to the n organizations at the top of the ranking.
n <- 40

# Chart 1: number of DMCA requests filed per reporting organization.
# Tally requests by organization name, largest first, then keep the top n.
tab <- sort(table(d$orgname), decreasing = TRUE)
tab <- head(tab, n)

# Black bars throughout, except red for The Publishers Association
# (nothing is recoloured if that name is not among the top n).
cols <- replace(rep("black", n),
                which(names(tab) == "The Publishers Association"),
                "red")

# Draw into a PNG file; the wide left margin leaves room for the
# horizontally printed organization names.
png(filename = "barplot.png", width = 800, height = 500)
par(mar = c(3, 15, 1, 1))
barplot(
  height = tab,
  horiz = TRUE,
  space = 0,
  las = 1,
  col = cols,
  border = "white",
  cex.names = .75,
  cex.axis = 1,
  main = "DMCA Requests to Google"
)
dev.off()


# Chart 2: total URLs removed per reporting organization (instead of the
# number of requests). Sum urls_removed within each organization, rank
# the organizations by that total, and keep the n largest as a named vector.
agg <- aggregate(urls_removed ~ orgname, data = d, FUN = sum)
agg <- agg[order(agg[["urls_removed"]], decreasing = TRUE), ]
tab2 <- setNames(head(agg[["urls_removed"]], n), head(agg[["orgname"]], n))

# Black bars throughout, except red for The Publishers Association
# (nothing is recoloured if that name is not among the top n).
cols2 <- replace(rep("black", n),
                 which(names(tab2) == "The Publishers Association"),
                 "red")

# Draw into a PNG file; the wide left margin leaves room for the
# horizontally printed organization names.
png(filename = "barplot2.png", width = 800, height = 500)
par(mar = c(3, 15, 1, 1))
barplot(
  height = tab2,
  horiz = TRUE,
  space = 0,
  las = 1,
  col = cols2,
  border = "white",
  cex.names = .75,
  cex.axis = 1,
  main = "DMCA URL Takedowns Requested from Google"
)
dev.off()
	# Google DMCA Transparency Data

	# 2014-07-24

	# https://www.google.com/transparencyreport/removals/copyright/data/

	# Fetch the zipped data dump, extract only requests.csv from it, and make
	# the extracted folder the working directory so that the relative paths
	# used further down (the CSV and the PNG outputs) resolve inside it.
	download.file(
	  url = "http://transparencyreport.storage.googleapis.com/google-websearch-copyright-removals.zip",
	  destfile = "localcopy.zip",
	  method = "curl"
	)
	unzip(
	  zipfile = "localcopy.zip",
	  files = "google-websearch-copyright-removals/requests.csv"
	)
	setwd(dir = "./google-websearch-copyright-removals")

	# Codebook
	# Request ID An ID unique to each copyright removal request Yes
	# Date The date and time (in UTC) that the request was received, in ISO 8601 format Yes
	# Chilling Effects URL URL to the Chilling Effects page documenting the request No
	# Copyright owner ID The ID number of a unique copyright owner Yes
	# Copyright owner name The name of the copyright owner associated with the request No
	# Reporting organization ID The ID number of a unique reporting organization Yes
	# Reporting organization name The name of the reporting organization associated with the request No
	# URLs removed The number of URLs removed. Learn more. Yes
	# URLs for which we took no action The number of URLs for which we took no action. Learn more. Yes
	# URLs pending review The number of URLs that are still pending review. Learn more. Yes
	# From Abuser If the request was submitted by someone we believe to be abusing the process Yes

	# Column schema for requests.csv, in the order the fields are listed in
	# the codebook above: each name is the short label the script uses for a
	# column, each value is the class that column is read as.
	classes <- setNames(
	  c("integer", "character", "character",
	    "integer", "character",
	    "integer", "character",
	    "integer", "integer", "integer",
	    "logical"),
	  c("requestid", "date", "url",
	    "ownerid", "ownername",
	    "orgid", "orgname",
	    "urls_removed", "urls_noaction", "urls_pending",
	    "abuser")
	)
	# Load the request log. The column names are taken from `classes` (via
	# col.names) and each column is read as the class declared for it there;
	# text columns stay character rather than becoming factors.
	d <- read.csv(
	  file = "requests.csv",
	  col.names = names(classes),
	  colClasses = classes,
	  stringsAsFactors = FALSE
	)

	# Both charts are limited to the n organizations at the top of the ranking.
	n <- 40

	# Chart 1: number of DMCA requests filed per reporting organization.
	# Tally requests by organization name, largest first, then keep the top n.
	tab <- sort(table(d$orgname), decreasing = TRUE)
	tab <- head(tab, n)

	# Black bars throughout, except red for The Publishers Association
	# (nothing is recoloured if that name is not among the top n).
	cols <- replace(rep("black", n),
	                which(names(tab) == "The Publishers Association"),
	                "red")

	# Draw into a PNG file; the wide left margin leaves room for the
	# horizontally printed organization names.
	png(filename = "barplot.png", width = 800, height = 500)
	par(mar = c(3, 15, 1, 1))
	barplot(
	  height = tab,
	  horiz = TRUE,
	  space = 0,
	  las = 1,
	  col = cols,
	  border = "white",
	  cex.names = .75,
	  cex.axis = 1,
	  main = "DMCA Requests to Google"
	)
	dev.off()


	# Chart 2: total URLs removed per reporting organization (instead of the
	# number of requests). Sum urls_removed within each organization, rank
	# the organizations by that total, and keep the n largest as a named vector.
	agg <- aggregate(urls_removed ~ orgname, data = d, FUN = sum)
	agg <- agg[order(agg[["urls_removed"]], decreasing = TRUE), ]
	tab2 <- setNames(head(agg[["urls_removed"]], n), head(agg[["orgname"]], n))

	# Black bars throughout, except red for The Publishers Association
	# (nothing is recoloured if that name is not among the top n).
	cols2 <- replace(rep("black", n),
	                 which(names(tab2) == "The Publishers Association"),
	                 "red")

	# Draw into a PNG file; the wide left margin leaves room for the
	# horizontally printed organization names.
	png(filename = "barplot2.png", width = 800, height = 500)
	par(mar = c(3, 15, 1, 1))
	barplot(
	  height = tab2,
	  horiz = TRUE,
	  space = 0,
	  las = 1,
	  col = cols2,
	  border = "white",
	  cex.names = .75,
	  cex.axis = 1,
	  main = "DMCA URL Takedowns Requested from Google"
	)
	dev.off()