Use R to analyse a large text file that is too big to read in all at once
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(chunked)
library(tidyverse)

# Goal: look at the daily page views of Wikipedia articles before 2015.
# Zipped log files can be downloaded from:
# https://dumps.wikimedia.org/other/pagecounts-ez/merged/2012/2012-12/
# Unzipping the .bz file gives this plain-text file:
my_file <- "pagecounts-2012-12-14/pagecounts-2012-12-14"

# How big is my file?
print(paste(round(file.info(my_file)$size / 2^30, 3), "gigabytes"))
# [1] "3.493 gigabytes" -- too big to open in Notepad++!
# But it can be read with 010 Editor.

# Look at the top of the file
readLines(my_file, n = 100)

# To find where the content starts, vary the skip value
read.table(my_file, nrows = 10, skip = 25)

# Let the chunked pkg work its magic! We only want the lines containing
# "Gun_control". The main challenge here was identifying the column
# header: the first data line gets used as the header, hence the odd
# column name De.mw.De.5.J3M1O1 referenced below.
df <-
  read_chunkwise(my_file,
                 chunk_size = 5000,
                 skip = 30,
                 format = "table",
                 header = TRUE) %>%
  filter(stringr::str_detect(De.mw.De.5.J3M1O1, "Gun_control"))

# read_chunkwise is lazy; this collect() triggers the actual pass over
# the 3.5 GB file, and takes a few moments...
system.time(out <- collect(df))

# Clean up the output: separate the single column into four
# (project, page title, view count, hourly breakdown) and
# get the number of page views as a numeric
out_df <-
  out %>%
  separate(De.mw.De.5.J3M1O1,
           into = str_glue("V{1:4}"),
           sep = " ") %>%
  mutate(V3 = as.numeric(V3))

head(out_df)
  V1                                                        V2   V3
1 en.z                                               Gun_control 7961
2 en.z Category:Gun_control_advocacy_groups_in_the_United_States 1396
3 en.z          Gun_control_policy_of_the_Clinton_Administration  223
4 en.z                            Category:Gun_control_advocates   80
5 en.z                         Gun_control_in_the_United_Kingdom   68
6 en.z                                    Gun_control_in_america   59
                                                                                 V4
1 A34B55C32D38E32F32G32H20I22J9K12L10M9N15O34P38Q37R83S197T1207U1643V1523W1528X1319
2                                     B1C5D2E1F3H3J1O1P3Q9R9S23T197U327V245W271X295
3                                     A3B2C4D2E3F3G1J3K1L1O3P2Q2R4S2T24U39V41W43X40
4                                                            D2H1M1S4T8U22V10W18X14
5                                                             B1C1S1T11U12V13W16X13
6                                                        B1H1M1N2P1S1T6U5V17W12X12
#--------------------
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(readr)
library(tidyverse)

# Goal: look at the daily page views of Wikipedia articles before 2015.
# Zipped log files can be downloaded from:
# https://dumps.wikimedia.org/other/pagecounts-ez/merged/2012/2012-12/
# Unzipping the .bz file gives this plain-text file:
my_file <- "pagecounts-2012-12-14/pagecounts-2012-12-14"

# read_delim_chunked is nice because it tells us the column names
# when we test it.

# View the structure of each chunk (str() is used directly as the callback)
read_lines_chunked(my_file, str, chunk_size = 5)

# Print the starting line of each chunk
f <- function(x, pos) print(pos)
read_lines_chunked(my_file, SideEffectChunkCallback$new(f), chunk_size = 5)

# Keep all of a string that matches a pattern in the string;
# ListCallback collects each chunk's matches into a list
f <- function(x, pos) stringr::str_subset(x, "Gun_control")
rdc <-
  read_lines_chunked(my_file,
                     ListCallback$new(f),
                     skip = 30)

# Flatten the per-chunk results into one character vector
rdc_chr <- unlist(rdc)
rdc_chr
[1] "commons.m Category:Gun_controllers 3 C1G1T1" | |
[2] "en.q Gun_control 4 N1U1V1X1" | |
[3] "en.z Category%25253AGun_control_advocacy_groups_in_the_United_States 1 U1" | |
[4] "en.z Category%3AGun_control_advocacy_groups_in_the_United_States 7 U3X4" | |
[5] "en.z Category:Gun_control_advocacy_groups_in_the_United_States 1396 B1C5D2E1F3H3J1O1P3Q9R9S23T197U327V245W271X295" | |
[6] "en.z Category:Gun_control_advocacy_groups_in_the_United_States) 4 W3X1" | |
[7] "en.z Category:Gun_control_advocates 80 D2H1M1S4T8U22V10W18X14" | |
[8] "en.z Dunblane_massacre%23Gun_control 6 U6" | |
[9] "en.z Dunblane_school_massacre%23Gun_control 33 S10T4U10V5W4" |
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.