Instantly share code, notes, and snippets.

Embed
What would you like to do?
wikipedia clickstream analysis
all
full-list.txt
new-clean-list.txt
wv_lines.*.tsv
counted.*.tsv
__pycache__

Processing scripts for Wikipedia clickstream

New steps

wget https://dumps.wikimedia.org/other/clickstream/2018-03/clickstream-enwiki-2018-03.tsv.gz
gzip -cd clickstream-enwiki-2018-03.tsv.gz \
    | sort -t $'\t' -k 2,2 \
    | ./squash.py \
    > counted-clickstream-enwiki-2018-03.tsv

squash.py requires much less memory because it takes advantage of the fact that the clickstream data has been sorted on the curr column. This means it only has to keep the rows for a single curr page in memory at a time, and can print the "squashed" counts for that page as soon as it reaches the next block of curr pages.

Steps (old)

First, optionally run filter_wv.py to filter the raw TSV files to get only the lines of interest.

Second, run show_summary.py to print a summary of the TSV files. The summaries will look like the following:

DOING all/2016_02_en_clickstream.tsv
TOTAL SIZE: 6695402206
PREV!=Wikipedia: 1770548247
PREV== other-internal 623664
PREV== other-search 0
PREV== other-external 0
PREV== other-empty 1648385145
PREV== other-other 121539438

DOING all/2016_03_en_clickstream.tsv
TOTAL SIZE: 6814906943
PREV!=Wikipedia: 2263238989
PREV== other-internal 545915
PREV== other-search 0
PREV== other-external 0
PREV== other-empty 2156754015
PREV== other-other 105939059
import sys
import argparse
import pandas as pd
# import numpy as np
# import matplotlib.pyplot as plt
# import datetime
# from dateutil.relativedelta import relativedelta
# import matplotlib.dates as dates
from util import *
def main():
    """Filter clickstream TSVs down to rows whose curr title is on a list.

    Writes one output TSV per input file, named with --output_prefix plus
    the input's basename.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("tsv", type=argparse.FileType('r'), nargs='*',
                        default=[sys.stdin],
                        help="the TSV files for which to print a summary; "
                             "can read one file from stdin")
    parser.add_argument("--filter_list", type=argparse.FileType('r'),
                        required=True,
                        help="filename of filter list; each line in the file is a "
                             "page title")
    parser.add_argument("--output_prefix", type=str,
                        default="wv_lines.",
                        help='prefix to use for output TSV files; '
                             'defaults to "wv_lines."')
    args = parser.parse_args()

    # Titles in the filter list use spaces; the dumps use underscores.
    wv_titles = {line.strip().replace(' ', '_') for line in args.filter_list}

    for tsv_file in args.tsv:
        print("DOING", tsv_file)
        frame = get_df(tsv_file)
        # 2016-format dumps name the column "curr"; 2015 dumps "curr_title".
        curr_col = "curr" if "curr" in frame else "curr_title"
        out_name = args.output_prefix + tsv_file.name.split('/')[-1]
        kept = frame[frame[curr_col].isin(wv_titles)]
        kept.to_csv(out_name, sep="\t", index=False)


if __name__ == "__main__":
    main()
DOING <_io.TextIOWrapper name='all/2015_01_en_clickstream.tsv' mode='r' encoding='UTF-8'>
TOTAL SIZE: 4036460178
PREV != Wikipedia 2938673785
PREV == other-wikipedia 99429039
PREV == other-internal 224136
PREV == other-google 1579464733
PREV == other-yahoo 48987014
PREV == other-bing 64418112
PREV == other-facebook 2595491
PREV == other-twitter 22517945
PREV == other-empty 1037955460
PREV == other 83081855
DOING <_io.TextIOWrapper name='all/2015_02_en_clickstream.tsv' mode='r' encoding='UTF-8'>
TOTAL SIZE: 3279134602
PREV != Wikipedia 2107923648
PREV == other-wikipedia 129619543
PREV == other-internal 340707
PREV == other-google 1494662520
PREV == other-yahoo 48445941
PREV == other-bing 65895496
PREV == other-facebook 2312328
PREV == other-twitter 19222486
PREV == other-empty 347424627
PREV == other 0
DOING <_io.TextIOWrapper name='all/2016_02_en_clickstream.tsv' mode='r' encoding='UTF-8'>
TOTAL SIZE: 6695402206
PREV != Wikipedia 1770548247
PREV == other-internal 623664
PREV == other-search 0
PREV == other-external 0
PREV == other-empty 1648385145
PREV == other-other 121539438
DOING <_io.TextIOWrapper name='all/2016_03_en_clickstream.tsv' mode='r' encoding='UTF-8'>
TOTAL SIZE: 6814906943
PREV != Wikipedia 2263238989
PREV == other-internal 545915
PREV == other-search 0
PREV == other-external 0
PREV == other-empty 2156754015
PREV == other-other 105939059
DOING <_io.TextIOWrapper name='all/2016_04_en_clickstream.tsv' mode='r' encoding='UTF-8'>
TOTAL SIZE: 6375609462
PREV != Wikipedia 5211164904
PREV == other-internal 107289998
PREV == other-search 2921325072
PREV == other-external 132805440
PREV == other-empty 2049584695
PREV == other-other 159699
DOING <_io.TextIOWrapper name='all/2016_08_en_clickstream.tsv' mode='r' encoding='UTF-8'>
TOTAL SIZE: 7502925411
PREV != Wikipedia 6199514320
PREV == other-internal 116612459
PREV == other-search 2895236000
PREV == other-external 158019950
PREV == other-empty 3029460248
PREV == other-other 185663
DOING <_io.TextIOWrapper name='all/2016_09_en_clickstream.tsv' mode='r' encoding='UTF-8'>
TOTAL SIZE: 6844156526
PREV != Wikipedia 5614311204
PREV == other-internal 116016558
PREV == other-search 2979102403
PREV == other-external 162445411
PREV == other-empty 2356531976
PREV == other-other 214856
import sys
import argparse
import pandas as pd
# import numpy as np
# import matplotlib.pyplot as plt
# import datetime
# from dateutil.relativedelta import relativedelta
# import matplotlib.dates as dates
from util import *
def group_func(x):
    '''
    Make a union out of all the Wikipedia page titles. This is useful in a
    groupby because otherwise all of the Wikipedia page titles will form
    separate groups.
    '''
    # Special referrer labels (either dump format) pass through untouched;
    # anything else is assumed to be a real page title.
    is_special = x in SPECIAL_NAMES_2015 or x in SPECIAL_NAMES
    return x if is_special else "Wikipedia"
def main():
    """Sum clickstream counts per (curr page, grouped referrer) pair.

    Reads each input TSV, collapses all real-page referrers into a single
    "Wikipedia" bucket via group_func, and writes the grouped sums to a
    TSV named with --output_prefix plus the input's basename.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("tsv", type=argparse.FileType('r'), nargs='*',
                        default=[sys.stdin],
                        help="the TSV files for which to print a summary; "
                             "can read one file from stdin")
    parser.add_argument("--output_prefix", type=str,
                        default="counted.",
                        help='prefix to use for output TSV files; '
                             'defaults to "counted."')
    args = parser.parse_args()

    for tsv_file in args.tsv:
        print("DOING", tsv_file)
        frame = get_df(tsv_file)
        # 2016-format dumps use prev/curr; 2015 dumps use *_title columns.
        is_2016 = "curr" in frame
        curr_col = "curr" if is_2016 else "curr_title"
        prev_col = "prev" if is_2016 else "prev_title"
        frame['grouped_prev'] = frame[prev_col].map(group_func)
        out_name = args.output_prefix + tsv_file.name.split('/')[-1]
        sums = frame.groupby([curr_col, 'grouped_prev'])['n'].sum()
        sums.to_csv(out_name, sep="\t")


if __name__ == "__main__":
    main()
page_title sum_of_n desktop_pageviews mobile_pageviews Wikipedia other-empty other-external other-internal other-other other-search
September 2016:
Scala_(programming_language) 43506 36K 8.4K 8189 5951 1801 925 0 26640
PHP 119222 94K 25K 11384 22426 4588 5318 44 75462
Taylor_Swift 481458 196K 282K 65814 99622 5528 14926 0 295568
Carly_Rae_Jepsen 74378 43K 33K 9653 15289 1259 2141 0 46036
August 2016:
Hillary_Clinton 963191 427K 476K 112048 222696 17963 27906 41 582537
import sys
import argparse
import pandas as pd
# import numpy as np
# import matplotlib.pyplot as plt
# import datetime
# from dateutil.relativedelta import relativedelta
# import matplotlib.dates as dates
from util import *
def main():
    """Print per-file summaries of clickstream TSVs.

    For each file: the total of the n column, the total attributed to
    special (non-article) referrers, and a per-referrer breakdown.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("tsv", type=argparse.FileType('r'), nargs='*',
                        default=[sys.stdin],
                        help="the TSV files for which to print a summary; "
                             "can read one file from stdin")
    args = parser.parse_args()

    for tsv_file in args.tsv:
        print("DOING", tsv_file)
        df = get_df(tsv_file)
        print("TOTAL SIZE:", df['n'].sum())
        # 2016 dumps: prev curr type n.
        # 2015 dumps: prev_id curr_id n prev_title curr_title type.
        is_2016 = "prev" in df
        prev_col = "prev" if is_2016 else "prev_title"
        special_lst = SPECIAL_NAMES if is_2016 else SPECIAL_NAMES_2015
        special_total = df[df[prev_col].isin(special_lst)]['n'].sum()
        print("PREV != Wikipedia", special_total)
        for i in special_lst:
            print("PREV ==", i, df[df[prev_col] == i]['n'].sum())
        print("")


if __name__ == "__main__":
    main()
#!/usr/bin/env python3
# License: CC0
import sys
# Referrer labels in the 2016+ clickstream format that are not article
# titles; these are kept distinct when squashing.
SPECIAL_NAMES = [
    "other-internal",
    "other-search",
    "other-external",
    "other-empty",
    "other-other",
]


def grouped(prev):
    """Return prev unchanged if it is a special referrer label, otherwise
    collapse it to the single label "Wikipedia"."""
    return prev if prev in SPECIAL_NAMES else "Wikipedia"
def main():
    """Stream a clickstream TSV (sorted by the curr column) from stdin and
    print one line per (curr, grouped referrer) with the summed count.

    Only one curr block is held in memory at a time, which is why the
    input must be pre-sorted on curr.
    """
    active_curr = None
    tally = {}
    for line in sys.stdin:
        prev, curr, _type, n = line.rstrip().split("\t")
        if curr != active_curr:
            # Entering a new curr block: flush the finished block first.
            # active_curr is None only before the first data line, in
            # which case there is nothing to flush.
            if active_curr is not None:
                for key, total in tally.items():
                    print("{}\t{}\t{}".format(active_curr, key, total))
            active_curr = curr
            tally = {}
        bucket = grouped(prev)
        tally[bucket] = tally.get(bucket, 0) + int(n)
    # Flush the final curr block (empty input prints nothing).
    for key, total in tally.items():
        print("{}\t{}\t{}".format(active_curr, key, total))


if __name__ == "__main__":
    main()
import pandas as pd
def get_df(fname):
    """Load a tab-separated clickstream file into a DataFrame.

    fname may be a path or an open file object — anything accepted by
    pandas.read_csv.
    """
    return pd.read_csv(fname, sep='\t')
# Non-article referrer labels in the "prev" column of 2016-format
# clickstream dumps (see show_summary.py's column handling).
SPECIAL_NAMES = [
    "other-internal",
    "other-search",
    "other-external",
    "other-empty",
    "other-other",
]
# Non-article referrer labels in the "prev_title" column of 2015-format
# dumps, which broke external traffic out by individual site.
SPECIAL_NAMES_2015 = [
    "other-wikipedia",
    "other-internal",
    "other-google",
    "other-yahoo",
    "other-bing",
    "other-facebook",
    "other-twitter",
    "other-empty",
    "other",
]
DOING <_io.TextIOWrapper name='wv_lines.2015_01_en_clickstream.tsv' mode='r' encoding='UTF-8'>
TOTAL SIZE: 759804934
PREV != Wikipedia 556163800
PREV == other-wikipedia 24192548
PREV == other-internal 77634
PREV == other-google 357252461
PREV == other-yahoo 13318672
PREV == other-bing 16964243
PREV == other-facebook 411390
PREV == other-twitter 4664164
PREV == other-empty 118854661
PREV == other 20428027
DOING <_io.TextIOWrapper name='wv_lines.2015_02_en_clickstream.tsv' mode='r' encoding='UTF-8'>
TOTAL SIZE: 674625332
PREV != Wikipedia 455598019
PREV == other-wikipedia 29076280
PREV == other-internal 113385
PREV == other-google 339674595
PREV == other-yahoo 13062424
PREV == other-bing 17131432
PREV == other-facebook 399093
PREV == other-twitter 3181712
PREV == other-empty 52959098
PREV == other 0
DOING <_io.TextIOWrapper name='wv_lines.2016_02_en_clickstream.tsv' mode='r' encoding='UTF-8'>
TOTAL SIZE: 1379628229
PREV != Wikipedia 315515278
PREV == other-internal 220124
PREV == other-search 0
PREV == other-external 0
PREV == other-empty 274718693
PREV == other-other 40576461
DOING <_io.TextIOWrapper name='wv_lines.2016_03_en_clickstream.tsv' mode='r' encoding='UTF-8'>
TOTAL SIZE: 1345050536
PREV != Wikipedia 384721197
PREV == other-internal 190293
PREV == other-search 0
PREV == other-external 0
PREV == other-empty 361275167
PREV == other-other 23255737
DOING <_io.TextIOWrapper name='wv_lines.2016_04_en_clickstream.tsv' mode='r' encoding='UTF-8'>
TOTAL SIZE: 1224895287
PREV != Wikipedia 1016723738
PREV == other-internal 25621783
PREV == other-search 640761551
PREV == other-external 26412638
PREV == other-empty 323878099
PREV == other-other 49667
DOING <_io.TextIOWrapper name='wv_lines.2016_08_en_clickstream.tsv' mode='r' encoding='UTF-8'>
TOTAL SIZE: 1206267682
PREV != Wikipedia 985040545
PREV == other-internal 29268536
PREV == other-search 583945447
PREV == other-external 31137710
PREV == other-empty 340626516
PREV == other-other 62336
DOING <_io.TextIOWrapper name='wv_lines.2016_09_en_clickstream.tsv' mode='r' encoding='UTF-8'>
TOTAL SIZE: 1210079281
PREV != Wikipedia 1000837228
PREV == other-internal 28906136
PREV == other-search 611235854
PREV == other-external 30412054
PREV == other-empty 330189620
PREV == other-other 93564
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment