Instantly share code, notes, and snippets.

Embed
What would you like to do?
wikipedia clickstream analysis
all
full-list.txt
new-clean-list.txt
wv_lines.*.tsv
counted.*.tsv
__pycache__

Processing scripts for Wikipedia clickstream

New steps

wget https://dumps.wikimedia.org/other/clickstream/2018-03/clickstream-enwiki-2018-03.tsv.gz
gzip -cd clickstream-enwiki-2018-03.tsv.gz \
    | sort -t $'\t' -k 2,2 \
    | ./squash.py \
    > counted-clickstream-enwiki-2018-03.tsv

squash.py requires much less memory because it takes advantage of the fact that the clickstream data has been sorted on the curr column. This means it only has to keep the rows for a single curr page in memory at a time, and can print the "squashed" counts for that page as soon as it reaches the next block of curr pages.

Steps (old)

First, optionally run filter_wv.py to filter the raw TSV files to get only the lines of interest.

Second, run show_summary.py to print a summary of the TSV files. The summaries will look like the following:

DOING all/2016_02_en_clickstream.tsv
TOTAL SIZE: 6695402206
PREV!=Wikipedia: 1770548247
PREV== other-internal 623664
PREV== other-search 0
PREV== other-external 0
PREV== other-empty 1648385145
PREV== other-other 121539438

DOING all/2016_03_en_clickstream.tsv
TOTAL SIZE: 6814906943
PREV!=Wikipedia: 2263238989
PREV== other-internal 545915
PREV== other-search 0
PREV== other-external 0
PREV== other-empty 2156754015
PREV== other-other 105939059
import sys
import argparse
import pandas as pd
# import numpy as np
# import matplotlib.pyplot as plt
# import datetime
# from dateutil.relativedelta import relativedelta
# import matplotlib.dates as dates
from util import *
def main():
    """Filter clickstream TSVs down to rows whose curr title is on a list.

    Writes one output TSV per input file, named with --output_prefix plus
    the input's basename.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("tsv", type=argparse.FileType('r'), nargs='*',
                        default=[sys.stdin],
                        help="the TSV files for which to print a summary; "
                             "can read one file from stdin")
    parser.add_argument("--filter_list", type=argparse.FileType('r'),
                        required=True,
                        help="filename of filter list; each line in the file is a "
                             "page title")
    parser.add_argument("--output_prefix", type=str,
                        default="wv_lines.",
                        help='prefix to use for output TSV files; '
                             'defaults to "wv_lines."')
    args = parser.parse_args()

    # Titles in the filter list use spaces; the dumps use underscores.
    wv_titles = {line.strip().replace(' ', '_') for line in args.filter_list}

    for tsv_file in args.tsv:
        print("DOING", tsv_file)
        frame = get_df(tsv_file)
        # 2016-format dumps name the column "curr"; 2015 dumps "curr_title".
        curr_col = "curr" if "curr" in frame else "curr_title"
        out_name = args.output_prefix + tsv_file.name.split('/')[-1]
        kept = frame[frame[curr_col].isin(wv_titles)]
        kept.to_csv(out_name, sep="\t", index=False)


if __name__ == "__main__":
    main()
DOING <_io.TextIOWrapper name='all/2015_01_en_clickstream.tsv' mode='r' encoding='UTF-8'>
TOTAL SIZE: 4036460178
PREV != Wikipedia 2938673785
PREV == other-wikipedia 99429039
PREV == other-internal 224136
PREV == other-google 1579464733
PREV == other-yahoo 48987014
PREV == other-bing 64418112
PREV == other-facebook 2595491
PREV == other-twitter 22517945
PREV == other-empty 1037955460
PREV == other 83081855
DOING <_io.TextIOWrapper name='all/2015_02_en_clickstream.tsv' mode='r' encoding='UTF-8'>
TOTAL SIZE: 3279134602
PREV != Wikipedia 2107923648
PREV == other-wikipedia 129619543
PREV == other-internal 340707
PREV == other-google 1494662520
PREV == other-yahoo 48445941
PREV == other-bing 65895496
PREV == other-facebook 2312328
PREV == other-twitter 19222486
PREV == other-empty 347424627
PREV == other 0
DOING <_io.TextIOWrapper name='all/2016_02_en_clickstream.tsv' mode='r' encoding='UTF-8'>
TOTAL SIZE: 6695402206
PREV != Wikipedia 1770548247
PREV == other-internal 623664
PREV == other-search 0
PREV == other-external 0
PREV == other-empty 1648385145
PREV == other-other 121539438
DOING <_io.TextIOWrapper name='all/2016_03_en_clickstream.tsv' mode='r' encoding='UTF-8'>
TOTAL SIZE: 6814906943
PREV != Wikipedia 2263238989
PREV == other-internal 545915
PREV == other-search 0
PREV == other-external 0
PREV == other-empty 2156754015
PREV == other-other 105939059
DOING <_io.TextIOWrapper name='all/2016_04_en_clickstream.tsv' mode='r' encoding='UTF-8'>
TOTAL SIZE: 6375609462
PREV != Wikipedia 5211164904
PREV == other-internal 107289998
PREV == other-search 2921325072
PREV == other-external 132805440
PREV == other-empty 2049584695
PREV == other-other 159699
DOING <_io.TextIOWrapper name='all/2016_08_en_clickstream.tsv' mode='r' encoding='UTF-8'>
TOTAL SIZE: 7502925411
PREV != Wikipedia 6199514320
PREV == other-internal 116612459
PREV == other-search 2895236000
PREV == other-external 158019950
PREV == other-empty 3029460248
PREV == other-other 185663
DOING <_io.TextIOWrapper name='all/2016_09_en_clickstream.tsv' mode='r' encoding='UTF-8'>
TOTAL SIZE: 6844156526
PREV != Wikipedia 5614311204
PREV == other-internal 116016558
PREV == other-search 2979102403
PREV == other-external 162445411
PREV == other-empty 2356531976
PREV == other-other 214856
import sys
import argparse
import pandas as pd
# import numpy as np
# import matplotlib.pyplot as plt
# import datetime
# from dateutil.relativedelta import relativedelta
# import matplotlib.dates as dates
from util import *
def group_func(x):
    '''
    Make a union out of all the Wikipedia page titles. This is useful in a
    groupby because otherwise all of the Wikipedia page titles will form
    separate groups.
    '''
    # Special referrer labels (either dump format) pass through untouched;
    # anything else is assumed to be a real page title.
    is_special = x in SPECIAL_NAMES_2015 or x in SPECIAL_NAMES
    return x if is_special else "Wikipedia"
def main():
    """Sum clickstream counts per (curr page, grouped referrer) pair.

    Reads each input TSV, collapses all real-page referrers into a single
    "Wikipedia" bucket via group_func, and writes the grouped sums to a
    TSV named with --output_prefix plus the input's basename.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("tsv", type=argparse.FileType('r'), nargs='*',
                        default=[sys.stdin],
                        help="the TSV files for which to print a summary; "
                             "can read one file from stdin")
    parser.add_argument("--output_prefix", type=str,
                        default="counted.",
                        help='prefix to use for output TSV files; '
                             'defaults to "counted."')
    args = parser.parse_args()

    for tsv_file in args.tsv:
        print("DOING", tsv_file)
        frame = get_df(tsv_file)
        # 2016-format dumps use prev/curr; 2015 dumps use *_title columns.
        is_2016 = "curr" in frame
        curr_col = "curr" if is_2016 else "curr_title"
        prev_col = "prev" if is_2016 else "prev_title"
        frame['grouped_prev'] = frame[prev_col].map(group_func)
        out_name = args.output_prefix + tsv_file.name.split('/')[-1]
        sums = frame.groupby([curr_col, 'grouped_prev'])['n'].sum()
        sums.to_csv(out_name, sep="\t")


if __name__ == "__main__":
    main()
page_title sum_of_n desktop_pageviews mobile_pageviews Wikipedia other-empty other-external other-internal other-other other-search
September 2016:
Scala_(programming_language) 43506 36K 8.4K 8189 5951 1801 925 0 26640
PHP 119222 94K 25K 11384 22426 4588 5318 44 75462
Taylor_Swift 481458 196K 282K 65814 99622 5528 14926 0 295568
Carly_Rae_Jepsen 74378 43K 33K 9653 15289 1259 2141 0 46036
August 2016:
Hillary_Clinton 963191 427K 476K 112048 222696 17963 27906 41 582537
import sys
import argparse
import pandas as pd
# import numpy as np
# import matplotlib.pyplot as plt
# import datetime
# from dateutil.relativedelta import relativedelta
# import matplotlib.dates as dates
from util import *
def main():
    """Print per-file summaries of clickstream TSVs.

    For each file: the total of the n column, the total attributed to
    special (non-article) referrers, and a per-referrer breakdown.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("tsv", type=argparse.FileType('r'), nargs='*',
                        default=[sys.stdin],
                        help="the TSV files for which to print a summary; "
                             "can read one file from stdin")
    args = parser.parse_args()

    for tsv_file in args.tsv:
        print("DOING", tsv_file)
        df = get_df(tsv_file)
        print("TOTAL SIZE:", df['n'].sum())
        # 2016 dumps: prev curr type n.
        # 2015 dumps: prev_id curr_id n prev_title curr_title type.
        is_2016 = "prev" in df
        prev_col = "prev" if is_2016 else "prev_title"
        special_lst = SPECIAL_NAMES if is_2016 else SPECIAL_NAMES_2015
        special_total = df[df[prev_col].isin(special_lst)]['n'].sum()
        print("PREV != Wikipedia", special_total)
        for i in special_lst:
            print("PREV ==", i, df[df[prev_col] == i]['n'].sum())
        print("")


if __name__ == "__main__":
    main()
#!/usr/bin/env python3
# License: CC0
import sys
# Referrer labels in the 2016+ clickstream format that are not article
# titles; these are kept distinct when squashing.
SPECIAL_NAMES = [
    "other-internal",
    "other-search",
    "other-external",
    "other-empty",
    "other-other",
]


def grouped(prev):
    """Return prev unchanged if it is a special referrer label, otherwise
    collapse it to the single label "Wikipedia"."""
    return prev if prev in SPECIAL_NAMES else "Wikipedia"
def main():
    """Stream a clickstream TSV (sorted by the curr column) from stdin and
    print one line per (curr, grouped referrer) with the summed count.

    Only one curr block is held in memory at a time, which is why the
    input must be pre-sorted on curr.
    """
    active_curr = None
    tally = {}
    for line in sys.stdin:
        prev, curr, _type, n = line.rstrip().split("\t")
        if curr != active_curr:
            # Entering a new curr block: flush the finished block first.
            # active_curr is None only before the first data line, in
            # which case there is nothing to flush.
            if active_curr is not None:
                for key, total in tally.items():
                    print("{}\t{}\t{}".format(active_curr, key, total))
            active_curr = curr
            tally = {}
        bucket = grouped(prev)
        tally[bucket] = tally.get(bucket, 0) + int(n)
    # Flush the final curr block (empty input prints nothing).
    for key, total in tally.items():
        print("{}\t{}\t{}".format(active_curr, key, total))


if __name__ == "__main__":
    main()
import pandas as pd
def get_df(fname):
    """Load a tab-separated clickstream file into a DataFrame.

    fname may be a path or an open file object — anything accepted by
    pandas.read_csv.
    """
    return pd.read_csv(fname, sep='\t')
# Non-article referrer labels in the "prev" column of 2016-format
# clickstream dumps (see show_summary.py's column handling).
SPECIAL_NAMES = [
    "other-internal",
    "other-search",
    "other-external",
    "other-empty",
    "other-other",
]
# Non-article referrer labels in the "prev_title" column of 2015-format
# dumps, which broke external traffic out by individual site.
SPECIAL_NAMES_2015 = [
    "other-wikipedia",
    "other-internal",
    "other-google",
    "other-yahoo",
    "other-bing",
    "other-facebook",
    "other-twitter",
    "other-empty",
    "other",
]
DOING <_io.TextIOWrapper name='wv_lines.2015_01_en_clickstream.tsv' mode='r' encoding='UTF-8'>
TOTAL SIZE: 759804934
PREV != Wikipedia 556163800
PREV == other-wikipedia 24192548
PREV == other-internal 77634
PREV == other-google 357252461
PREV == other-yahoo 13318672
PREV == other-bing 16964243
PREV == other-facebook 411390
PREV == other-twitter 4664164
PREV == other-empty 118854661
PREV == other 20428027
DOING <_io.TextIOWrapper name='wv_lines.2015_02_en_clickstream.tsv' mode='r' encoding='UTF-8'>
TOTAL SIZE: 674625332
PREV != Wikipedia 455598019
PREV == other-wikipedia 29076280
PREV == other-internal 113385
PREV == other-google 339674595
PREV == other-yahoo 13062424
PREV == other-bing 17131432
PREV == other-facebook 399093
PREV == other-twitter 3181712
PREV == other-empty 52959098
PREV == other 0
DOING <_io.TextIOWrapper name='wv_lines.2016_02_en_clickstream.tsv' mode='r' encoding='UTF-8'>
TOTAL SIZE: 1379628229
PREV != Wikipedia 315515278
PREV == other-internal 220124
PREV == other-search 0
PREV == other-external 0
PREV == other-empty 274718693
PREV == other-other 40576461
DOING <_io.TextIOWrapper name='wv_lines.2016_03_en_clickstream.tsv' mode='r' encoding='UTF-8'>
TOTAL SIZE: 1345050536
PREV != Wikipedia 384721197
PREV == other-internal 190293
PREV == other-search 0
PREV == other-external 0
PREV == other-empty 361275167
PREV == other-other 23255737
DOING <_io.TextIOWrapper name='wv_lines.2016_04_en_clickstream.tsv' mode='r' encoding='UTF-8'>
TOTAL SIZE: 1224895287
PREV != Wikipedia 1016723738
PREV == other-internal 25621783
PREV == other-search 640761551
PREV == other-external 26412638
PREV == other-empty 323878099
PREV == other-other 49667
DOING <_io.TextIOWrapper name='wv_lines.2016_08_en_clickstream.tsv' mode='r' encoding='UTF-8'>
TOTAL SIZE: 1206267682
PREV != Wikipedia 985040545
PREV == other-internal 29268536
PREV == other-search 583945447
PREV == other-external 31137710
PREV == other-empty 340626516
PREV == other-other 62336
DOING <_io.TextIOWrapper name='wv_lines.2016_09_en_clickstream.tsv' mode='r' encoding='UTF-8'>
TOTAL SIZE: 1210079281
PREV != Wikipedia 1000837228
PREV == other-internal 28906136
PREV == other-search 611235854
PREV == other-external 30412054
PREV == other-empty 330189620
PREV == other-other 93564
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment