Use R to analyse a large text file that is too big to read in all at once
library(chunked)
library(tidyverse)
# I want to look at the daily page views of Wikipedia articles
# before 2015... I can get zipped log files
# from here: https://dumps.wikimedia.org/other/pagecounts-ez/merged/2012/2012-12/
# I get a .bz2 file; decompressing it gives this:
my_file <- 'pagecounts-2012-12-14/pagecounts-2012-12-14'
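# A sketch of fetching and decompressing the dump from within R rather than
# by hand; the exact file name in the URL is an assumption, so check the
# directory listing first. Needs the R.utils package for bunzip2().
bz_file <- "pagecounts-2012-12-14.bz2"
download.file(
  paste0("https://dumps.wikimedia.org/other/pagecounts-ez/merged/2012/2012-12/",
         bz_file),
  destfile = bz_file, mode = "wb")
dir.create("pagecounts-2012-12-14", showWarnings = FALSE)
R.utils::bunzip2(bz_file, destname = my_file, remove = FALSE)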
# How big is my file?
print(paste(round(file.info(my_file)$size / 2^30,3), 'gigabytes'))
# [1] "3.493 gigabytes" too big to open in Notepad++ !
# But can read with 010 Editor
# look at the top of the file
readLines(my_file, n = 100)
# to find where the content starts, vary the skip value,
read.table(my_file, nrows = 10, skip = 25)
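# A quick, throwaway way to see where the real data begins: try a few
# candidate skip values and look at the first rows of each (the values
# below are just guesses to bracket the header block).
for (s in c(0, 10, 20, 25, 30)) {
  cat("---- skip =", s, "----\n")
  try(print(read.table(my_file, nrows = 3, skip = s)))
}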
# Let the chunked pkg work its magic! We only want the lines containing
# "Gun_control". The main challenge here was identifying the column
# header
df <-
  read_chunkwise(my_file,
                 chunk_size = 5000,
                 skip = 30,
                 format = "table",
                 header = TRUE) %>%
  filter(stringr::str_detect(De.mw.De.5.J3M1O1, "Gun_control"))
# this line does the evaluation,
# and takes a few moments...
system.time(out <- collect(df))
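# An aside, as a quick sketch: chunked can also stream the filtered rows
# straight to a much smaller file on disk with write_chunkwise(), instead
# of collecting them into memory; that file can then be read normally.
# The output file name here is just an example.
df %>%
  write_chunkwise("gun_control_rows.csv")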
# clean up the output: separate it into columns
# and convert the page-view count to numeric
out_df <-
  out %>%
  separate(De.mw.De.5.J3M1O1,
           into = stringr::str_glue("V{1:4}"),
           sep = " ") %>%
  mutate(V3 = as.numeric(V3))
head(out_df)
# V1 V2 V3
# 1 en.z Gun_control 7961
# 2 en.z Category:Gun_control_advocacy_groups_in_the_United_States 1396
# 3 en.z Gun_control_policy_of_the_Clinton_Administration 223
# 4 en.z Category:Gun_control_advocates 80
# 5 en.z Gun_control_in_the_United_Kingdom 68
# 6 en.z Gun_control_in_america 59
# V4
# 1 A34B55C32D38E32F32G32H20I22J9K12L10M9N15O34P38Q37R83S197T1207U1643V1523W1528X1319
# 2 B1C5D2E1F3H3J1O1P3Q9R9S23T197U327V245W271X295
# 3 A3B2C4D2E3F3G1J3K1L1O3P2Q2R4S2T24U39V41W43X40
# 4 D2H1M1S4T8U22V10W18X14
# 5 B1C1S1T11U12V13W16X13
# 6 B1H1M1N2P1S1T6U5V17W12X12
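# The V4 string packs hourly counts: in the pagecounts-ez format (as I read
# its documentation) each letter A..X marks an hour 0..23 and is followed by
# that hour's view count. A sketch of unpacking it for the main article:
gun_hours <-
  stringr::str_match_all(out_df$V4[out_df$V2 == "Gun_control"],
                         "([A-X])(\\d+)")[[1]]
hourly <- tibble(hour  = match(gun_hours[, 2], LETTERS[1:24]) - 1,
                 views = as.numeric(gun_hours[, 3]))
hourly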
#--------------------
library(readr)
library(tidyverse)
# I want to look at the daily page views of Wikipedia articles
# before 2015... I can get zipped log files
# from here: https://dumps.wikimedia.org/other/pagecounts-ez/merged/2012/2012-12/
# I get a .bz2 file; decompressing it gives this:
my_file <- 'pagecounts-2012-12-14/pagecounts-2012-12-14'
# readr's read_delim_chunked is nice because it reports the column names
# when we test it; here I use read_lines_chunked with callbacks instead
# view structure of each chunk
read_lines_chunked(my_file, str, chunk_size = 5)
# Print starting line of each chunk
f <- function(x, pos) print(pos)
read_lines_chunked(my_file, SideEffectChunkCallback$new(f), chunk_size = 5)
# Keep each whole line that contains the pattern
f <- function(x, pos) stringr::str_subset(x, "Gun_control")
rdc <-
  read_lines_chunked(my_file,
                     ListCallback$new(f),
                     skip = 30)
rdc_chr <- unlist(rdc)
rdc_chr
[1] "commons.m Category:Gun_controllers 3 C1G1T1"
[2] "en.q Gun_control 4 N1U1V1X1"
[3] "en.z Category%25253AGun_control_advocacy_groups_in_the_United_States 1 U1"
[4] "en.z Category%3AGun_control_advocacy_groups_in_the_United_States 7 U3X4"
[5] "en.z Category:Gun_control_advocacy_groups_in_the_United_States 1396 B1C5D2E1F3H3J1O1P3Q9R9S23T197U327V245W271X295"
[6] "en.z Category:Gun_control_advocacy_groups_in_the_United_States) 4 W3X1"
[7] "en.z Category:Gun_control_advocates 80 D2H1M1S4T8U22V10W18X14"
[8] "en.z Dunblane_massacre%23Gun_control 6 U6"
[9] "en.z Dunblane_school_massacre%23Gun_control 33 S10T4U10V5W4"