Skip to content

Instantly share code, notes, and snippets.

@duhaime
Created September 9, 2021 16:35
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save duhaime/db54a10161112db6eabdcc0121bd5938 to your computer and use it in GitHub Desktop.
Save duhaime/db54a10161112db6eabdcc0121bd5938 to your computer and use it in GitHub Desktop.
get_wiki_people.py
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "8c055fde",
"metadata": {},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2\n",
"\n",
"dataset = {}"
]
},
{
"cell_type": "markdown",
"id": "34e10acb",
"metadata": {},
"source": [
"# Get Text Data"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f4d07f06",
"metadata": {},
"outputs": [],
"source": [
"from multiprocessing import Pool\n",
"import json\n",
"import os\n",
"\n",
"# directory that receives the batched pages; same location as the data/humans/* glob that reads them back\n",
"humans = os.path.join('data', 'humans')\n",
"\n",
"def process_batch(l, idx):\n",
"    '''Write the list of raw pages l to humans-<idx>.json inside the humans directory'''\n",
"    os.makedirs(humans, exist_ok=True)\n",
"    with open(os.path.join(humans, 'humans-' + str(idx) + '.json'), 'w') as out:\n",
"        json.dump(l, out)\n",
"\n",
"def keep_page(page):\n",
"    '''Return a bool indicating if this page should be retained for analysis'''\n",
"    return is_human_page(page)\n",
"\n",
"def is_human_page(page):\n",
"    '''Return a bool indicating if this is a human page\n",
"\n",
"    A page qualifies when it contains the substring births]] and\n",
"    is_bad_page does not reject its title.\n",
"    '''\n",
"    return 'births]]' in page and not is_bad_page(page)\n",
"\n",
"def is_bad_page(page):\n",
"    '''Return a bool indicating if the page title starts with Wikipedia:, Template: or Draft:'''\n",
"    return get_page_name(page).startswith(('Wikipedia:', 'Template:', 'Draft:'))\n",
"\n",
"def get_page_name(page):\n",
"    '''Given a wikipedia page return the content of its first <title> element\n",
"\n",
"    Raises IndexError when the page holds no <title> tag.\n",
"    '''\n",
"    return page.split('<title>')[1].split('</title>')[0]\n",
"\n",
"def get_page_text(page):\n",
"    '''Given a wikipedia page in XML form return plaintext for text mining\n",
"\n",
"    The result is lowercased, stripped of punctuation and whitespace-normalised.\n",
"    Any failure is printed and reported as an empty string.\n",
"    '''\n",
"    try:\n",
"        # skip the attributes of the opening <text ...> tag, then keep everything up to </text>;\n",
"        # maxsplit=1 stops the split at the tag end instead of consuming the '>' of </text>\n",
"        c = page.split('<text ')[1].split('>', 1)[1].split('</text>', 1)[0]\n",
"        c = remove_markup(c)\n",
"        c = BeautifulSoup(c).get_text()\n",
"        c = c.replace('=', ' ')\n",
"        c = c.lower()\n",
"        c = c.translate(str.maketrans(string.punctuation, ' '*len(string.punctuation)))\n",
"        c = ' '.join(c.split())\n",
"        return c\n",
"    except Exception as exc:\n",
"        print(exc)\n",
"        return ''\n",
"\n",
"def parse_pages():\n",
"    '''Stream the enwiki XML dump and write the retained pages to disk in batches of 1000\n",
"\n",
"    Lines up to and including </siteinfo> are ignored. After that, lines are\n",
"    gathered until the next line containing <page>, at which point the gathered\n",
"    page is tested with keep_page and queued for process_batch.\n",
"    '''\n",
"    # downloaded from https://dumps.wikimedia.org/enwiki/20210820/enwiki-20210820-pages-articles-multistream.xml.bz2\n",
"    # if that's gone, look in https://dumps.wikimedia.org/enwiki/ for a more recent export with same pages-articles-multistream pattern\n",
"    batch_size = 1000\n",
"    l = []\n",
"    current = []  # lines of the page being gathered; joined once per page\n",
"    headers_clipped = False\n",
"    batch_count = 0\n",
"    kept = 0\n",
"    with open(os.path.join('data', 'xml', 'enwiki-20210820-pages-articles-multistream.xml'), encoding='utf8') as f:\n",
"        for i in f:\n",
"            if '<page>' in i:\n",
"                # we've accumulated a full page -- if it's a person page add it\n",
"                if current:\n",
"                    page = ''.join(current)\n",
"                    if keep_page(page):\n",
"                        l.append(page)\n",
"                        kept += 1\n",
"                    current = []\n",
"                if len(l) >= batch_size:\n",
"                    process_batch(l, batch_count)\n",
"                    l = []\n",
"                    batch_count += 1\n",
"            if headers_clipped:\n",
"                current.append(i)\n",
"            if '</siteinfo>' in i:\n",
"                headers_clipped = True\n",
"    # the last page is not followed by another <page> line, so test it here\n",
"    if current:\n",
"        page = ''.join(current)\n",
"        if keep_page(page):\n",
"            l.append(page)\n",
"            kept += 1\n",
"    if l:\n",
"        process_batch(l, batch_count)\n",
"    # print total number of retained pages\n",
"    print(' * processed', kept)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "053d9a40",
"metadata": {},
"outputs": [],
"source": [
"from wikitextparser import remove_markup\n",
"from bs4 import BeautifulSoup\n",
"import hashlib\n",
"import string\n",
"import glob\n",
"import json\n",
"import html\n",
"import os\n",
"\n",
"# collect every human page from the batch files into dataset, keyed by page title\n",
"for i in glob.glob(os.path.join('data', 'humans', '*')):\n",
"    # close each batch file as soon as it has been parsed\n",
"    with open(i) as f:\n",
"        pages = json.load(f)\n",
"    for j in pages:\n",
"        if is_human_page(j):\n",
"            name = get_page_name(j)\n",
"            #text = get_page_text(j),\n",
"            dataset[name] = {\n",
"                'raw': j,\n",
"                'length': len(j),\n",
"            }\n",
"\n",
"# make sure the output directory exists before writing the combined dataset\n",
"os.makedirs(os.path.join('data', 'json'), exist_ok=True)\n",
"with open(os.path.join('data', 'json', 'dataset.json'), 'w') as out:\n",
"    json.dump(dataset, out)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c4a0f773",
"metadata": {},
"outputs": [],
"source": [
"%matplotlib inline\n",
"import matplotlib.pyplot as plt\n",
"\n",
"# distribution of raw page sizes (in characters), leaving out pages of 20000 characters or more\n",
"l = [size for size in (len(page['raw']) for page in dataset.values()) if size < 20000]\n",
"\n",
"plt.hist(l, bins=100)\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "08e703be",
"metadata": {},
"outputs": [],
"source": [
"# group people by bio length so examples can be pulled from each size range\n",
"from collections import defaultdict\n",
"\n",
"# bin k collects the names whose stored length falls in [5000*k, 5000*(k+1))\n",
"d = defaultdict(list)\n",
"for name, record in dataset.items():\n",
"    d[record['length'] // 5000].append(name)\n",
"\n",
"# occupied bins, largest first\n",
"sorted(d, reverse=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "43c3a6a2",
"metadata": {},
"outputs": [],
"source": [
"# store the length bin of all people and delete those with short bios\n",
"min_lenbin = 2  # d is keyed by length // 5000, so this keeps bios of 10000+ characters\n",
"keepers = set()\n",
"for lenbin, names in d.items():\n",
"    # record the bin on each entry; the texts export reads dataset[name]['lenbin']\n",
"    for name in names:\n",
"        dataset[name]['lenbin'] = lenbin\n",
"    if lenbin >= min_lenbin:\n",
"        keepers.update(names)\n",
"\n",
"# collect the names first: deleting while iterating over dataset would raise\n",
"to_delete = [name for name in dataset if name not in keepers]\n",
"for name in to_delete:\n",
"    del dataset[name]"
]
},
{
"cell_type": "markdown",
"id": "77e8dcc9",
"metadata": {},
"source": [
"# Get Page Image and Bio"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "a756c930",
"metadata": {},
"outputs": [],
"source": [
"import json, os\n",
"\n",
"# reload the saved dataset; the context manager closes the file once it is parsed\n",
"with open(os.path.join('data', 'json', 'dataset.json')) as f:\n",
"    dataset = json.load(f)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "b7760210",
"metadata": {},
"outputs": [],
"source": [
"type(dataset), len(dataset)\n",
"\n",
"keys = list(dataset.keys())"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "16b52133",
"metadata": {},
"outputs": [],
"source": [
"# get a smaller group to focus on\n",
"l = [i for i in keys if len(dataset[i]['raw']) >= 50000]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "58050154",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0 2021-09-09 09:21:50.938463\n",
"100 2021-09-09 09:22:49.699364\n",
"200 2021-09-09 09:23:56.098555\n",
"300 2021-09-09 09:24:56.744201\n",
"400 2021-09-09 09:25:57.821086\n",
"500 2021-09-09 09:26:57.883951\n",
"600 2021-09-09 09:27:59.442425\n",
"700 2021-09-09 09:29:01.890677\n",
"800 2021-09-09 09:30:01.504646\n",
"900 2021-09-09 09:31:00.793757\n",
"1000 2021-09-09 09:32:00.160513\n",
"1100 2021-09-09 09:33:00.319626\n",
"1200 2021-09-09 09:34:01.462002\n",
"1300 2021-09-09 09:35:00.213132\n",
"1400 2021-09-09 09:36:04.343532\n",
"1500 2021-09-09 09:37:03.728502\n",
"1600 2021-09-09 09:38:04.409510\n",
"1700 2021-09-09 09:39:05.645563\n",
"1800 2021-09-09 09:40:05.512104\n",
"1900 2021-09-09 09:41:04.983853\n",
"2000 2021-09-09 09:42:05.071490\n",
"2100 2021-09-09 09:43:04.250173\n",
"2200 2021-09-09 09:44:05.823320\n",
"2300 2021-09-09 09:45:05.642249\n",
"2400 2021-09-09 09:46:09.129941\n",
"2500 2021-09-09 09:47:10.542313\n",
"2600 2021-09-09 09:48:09.532581\n",
"2700 2021-09-09 09:49:16.028473\n",
"2800 2021-09-09 09:50:19.870628\n",
"2900 2021-09-09 09:51:24.437579\n",
"3000 2021-09-09 09:52:25.665282\n",
"3100 2021-09-09 09:53:30.297827\n",
"3200 2021-09-09 09:54:44.955433\n",
"3300 2021-09-09 09:55:52.429292\n",
"3400 2021-09-09 09:56:56.851773\n",
"3500 2021-09-09 09:58:04.255281\n",
"3600 2021-09-09 09:59:08.066332\n",
"3700 2021-09-09 10:00:13.600882\n",
"3800 2021-09-09 10:01:20.624927\n",
"3900 2021-09-09 10:02:25.553990\n",
"4000 2021-09-09 10:03:34.968171\n",
"4100 2021-09-09 10:04:37.125105\n",
"4200 2021-09-09 10:05:40.407464\n",
"4300 2021-09-09 10:06:45.158944\n",
"4400 2021-09-09 10:07:45.045416\n",
"4500 2021-09-09 10:09:04.535555\n",
"4600 2021-09-09 10:10:08.300479\n",
"4700 2021-09-09 10:11:09.304536\n",
"4800 2021-09-09 10:12:19.841213\n",
"4900 2021-09-09 10:13:18.449895\n",
"5000 2021-09-09 10:14:16.242837\n",
"5100 2021-09-09 10:15:13.686099\n",
"5200 2021-09-09 10:16:11.130482\n",
"5300 2021-09-09 10:17:12.401129\n",
"5400 2021-09-09 10:18:09.957899\n",
"5500 2021-09-09 10:19:07.357226\n",
"5600 2021-09-09 10:20:04.952373\n",
"5700 2021-09-09 10:21:04.335310\n",
"5800 2021-09-09 10:22:05.479684\n",
"5900 2021-09-09 10:23:02.874859\n",
"6000 2021-09-09 10:24:00.527678\n",
"6100 2021-09-09 10:24:58.551270\n",
"6200 2021-09-09 10:25:56.270927\n",
"6300 2021-09-09 10:26:55.179370\n",
"6400 2021-09-09 10:27:55.873549\n",
"6500 2021-09-09 10:28:52.673341\n",
"6600 2021-09-09 10:29:49.936690\n",
"6700 2021-09-09 10:30:47.451767\n",
"6800 2021-09-09 10:31:44.667065\n",
"6900 2021-09-09 10:33:19.921487\n",
"7000 2021-09-09 10:34:17.552654\n",
"7100 2021-09-09 10:36:21.059461\n",
"7200 2021-09-09 10:37:21.757798\n",
"7300 2021-09-09 10:38:18.303951\n",
"7400 2021-09-09 10:39:15.664794\n",
"7500 2021-09-09 10:40:12.685598\n",
"7600 2021-09-09 10:42:25.327668\n",
"7700 2021-09-09 10:43:22.774925\n",
"7800 2021-09-09 10:44:21.692950\n",
"7900 2021-09-09 10:45:18.834321\n",
"8000 2021-09-09 10:46:19.470151\n",
"8100 2021-09-09 10:47:17.386969\n",
"8200 2021-09-09 10:48:14.879007\n",
"8300 2021-09-09 10:49:15.259692\n",
"8400 2021-09-09 10:50:13.423401\n",
"8500 2021-09-09 10:51:13.040644\n",
"8600 2021-09-09 10:52:10.386851\n",
"8700 2021-09-09 10:53:08.048590\n",
"8800 2021-09-09 10:54:05.640330\n",
"8900 2021-09-09 10:55:06.592415\n"
]
}
],
"source": [
"from SPARQLWrapper import SPARQLWrapper, JSON\n",
"import html, datetime\n",
"\n",
"# characters that may not appear inside a SPARQL <...> IRI, mapped to their percent-encoded form\n",
"IRI_ESCAPES = {code: '%{:02X}'.format(code) for code in (0x22, 0x3C, 0x3E, 0x5C, 0x5E, 0x60, 0x7B, 0x7C, 0x7D)}\n",
"\n",
"def format_page_name(s):\n",
"    '''Join the whitespace-separated words of a page title with underscores'''\n",
"    s = '_'.join(s.split())\n",
"    #s = html.unescape(s)\n",
"    return s\n",
"\n",
"def get_dbpedia_metadata(name):\n",
"    '''Given a pagename get the dbpedia image and metadata\n",
"\n",
"    Returns a dict with the keys name, abstract and thumb, or None when the\n",
"    query yields no row with both an English abstract and a thumbnail.\n",
"    '''\n",
"    name = format_page_name(name)\n",
"    # titles come from the XML dump, so decode entities such as &amp; first, then\n",
"    # percent-encode whatever would terminate or invalidate the <...> IRI below\n",
"    resource = html.unescape(name).translate(IRI_ESCAPES)\n",
"\n",
"    # nb: the following query can be tested on https://dbpedia.org/sparql\n",
"    query = '''\n",
"    prefix dbpedia: <http://dbpedia.org/resource/>\n",
"    prefix dbpedia-owl: <http://dbpedia.org/ontology/>\n",
"\n",
"    select ?abstract ?thumbnail where { \n",
"    <http://dbpedia.org/resource/''' + resource + '''> dbpedia-owl:abstract ?abstract ;\n",
"    dbpedia-owl:thumbnail ?thumbnail .\n",
"    filter(langMatches(lang(?abstract),\"en\"))\n",
"    }\n",
"    '''\n",
"\n",
"    sparql = SPARQLWrapper('http://dbpedia.org/sparql')\n",
"    sparql.setReturnFormat(JSON)\n",
"    sparql.setQuery(query)\n",
"    j = sparql.query().convert()\n",
"    try:\n",
"        result = j['results']['bindings'][0]\n",
"        return {\n",
"            'name': name,\n",
"            'abstract': result['abstract']['value'],\n",
"            'thumb': result['thumbnail']['value'],\n",
"        }\n",
"    except (KeyError, IndexError):\n",
"        # empty bindings: nothing matched the query\n",
"        return None\n",
"\n",
"results = []\n",
"for idx, i in enumerate(l):\n",
"    if idx%100 == 0: print(idx, datetime.datetime.now())\n",
"    try:\n",
"        result = get_dbpedia_metadata(i)\n",
"    except Exception as exc:\n",
"        # best effort: report the failed lookup and carry on; unlike a bare except\n",
"        # this still lets KeyboardInterrupt stop the long-running loop\n",
"        print(' * lookup failed for', i, exc)\n",
"        continue\n",
"    if not result: continue\n",
"    # the dataset entry itself is extended with the dbpedia fields\n",
"    record = dataset[i]\n",
"    record.update(result)\n",
"    results.append(record)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "74048443",
"metadata": {},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2"
]
},
{
"cell_type": "markdown",
"id": "af5b8371",
"metadata": {},
"source": [
"# Extract Faces"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9affe20c",
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"from matplotlib import pyplot\n",
"from mtcnn.mtcnn import MTCNN\n",
"import os, json, glob, imageio\n",
"import numpy as np\n",
"\n",
"with open(os.path.join('data', 'json', 'image_urls.json')) as f:\n",
"    image_urls = json.load(f)\n",
"detector = MTCNN()\n",
"os.makedirs(os.path.join('data', 'faces'), exist_ok=True)\n",
"\n",
"for name in image_urls:\n",
"    name = format_page_name(name)\n",
"    # glob.escape keeps brackets, * and ? in a page name from acting as wildcards\n",
"    images = glob.glob(os.path.join('data', 'images', glob.escape(name) + '-*'))\n",
"    print(name, 'has', len(images), 'images')\n",
"    for image_idx, image in enumerate(images):\n",
"        try:\n",
"            # keep the first three channels only\n",
"            im = pyplot.imread(image)[:,:,:3]\n",
"        except Exception:\n",
"            # files that cannot be read or sliced this way are removed from disk\n",
"            os.remove(image)\n",
"            continue\n",
"        try:\n",
"            detections = detector.detect_faces(im)\n",
"            for result_idx, result in enumerate(detections):\n",
"                confidence = result['confidence']\n",
"                points = result['keypoints'] # each is x, y\n",
"                x1, y1, width, height = result['box']\n",
"                # skip uncertain detections and faces smaller than 80px on a side\n",
"                if confidence < 0.95: continue\n",
"                if width < 80 or height < 80: continue\n",
"                # identify keypoints\n",
"                le_x, le_y = points['left_eye']\n",
"                re_x, re_y = points['right_eye']\n",
"                lm_x, lm_y = points['mouth_left']\n",
"                rm_x, rm_y = points['mouth_right']\n",
"                no_x, no_y = points['nose']\n",
"                # roll: mean vertical offset between left/right eyes and mouth corners, relative to box height\n",
"                roll = np.mean([le_y - re_y, lm_y - rm_y]) / height\n",
"                # yaw: horizontal offset of the nose from the eye midpoint, relative to box width\n",
"                yaw = (((le_x + re_x) / 2) - no_x) / width\n",
"                # crop face with 20% padding; clamp at 0 because a negative start index would wrap around\n",
"                pad_top = int(height * 0.2)\n",
"                pad_side = int(width * 0.2)\n",
"                x2, y2 = x1 + width, y1 + height\n",
"                top, left = max(0, y1 - pad_top), max(0, x1 - pad_side)\n",
"                face = im[top:y2+pad_top, left:x2+pad_side]\n",
"                filename = '{}_{}_{}_{}_{}_{}.jpg'.format(\n",
"                    name,\n",
"                    image_idx,\n",
"                    result_idx,\n",
"                    round(confidence, 2),\n",
"                    round(roll, 2),\n",
"                    round(yaw, 2),\n",
"                )\n",
"                path = os.path.join('data', 'faces', filename)\n",
"                imageio.imwrite(path, face)\n",
"        except Exception as exc:\n",
"            print(exc)\n",
"            continue"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f4575c07",
"metadata": {},
"outputs": [],
"source": [
"from keras_vggface.vggface import VGGFace\n",
"import numpy as np\n",
"\n",
"# get face vector for comparison and identification of same person across multiple images\n",
"model = VGGFace(model='resnet50')\n",
"a = np.random.rand(1,224,224,3)\n",
"z = model.predict(a)\n",
"z.shape"
]
},
{
"cell_type": "markdown",
"id": "487e824e",
"metadata": {},
"source": [
"# Vectorize text"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d7dda2fa",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.decomposition import NMF\n",
"\n",
"n_topics = 100\n",
"word_scalar = 1000\n",
"\n",
"# TF-IDF weighting; the vocabulary is capped at word_scalar terms per topic\n",
"vectorizer = TfidfVectorizer(input='content', stop_words='english', min_df=5, max_df=0.8, max_features=word_scalar * n_topics)\n",
"\n",
"# NMF factorisation with one component per topic\n",
"model = NMF(n_components=n_topics, random_state=1, verbose=1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dd1e7f9d",
"metadata": {},
"outputs": [],
"source": [
"def _page_text(record):\n",
"    '''Return the plaintext of a dataset record, deriving and caching it when the record has none'''\n",
"    if 'text' not in record:\n",
"        record['text'] = get_page_text(record['raw'])\n",
"    return record['text']\n",
"\n",
"# 200-character preview of every page; lenbin falls back to the 5000-character bin of the stored length\n",
"texts = [{\n",
"    'name': i,\n",
"    'text': _page_text(dataset[i])[:200] + '...',\n",
"    'lenbin': dataset[i].get('lenbin', dataset[i]['length'] // 5000),\n",
"} for i in dataset]\n",
"\n",
"with open(os.path.join('data', 'json', 'texts.json'), 'w') as out:\n",
"    json.dump(texts, out)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "380017f3",
"metadata": {},
"outputs": [],
"source": [
"# derive the plaintext for records stored without a 'text' field, so the lookup below cannot raise KeyError\n",
"for record in dataset.values():\n",
"    if 'text' not in record:\n",
"        record['text'] = get_page_text(record['raw'])\n",
"\n",
"l = [record['text'] for record in dataset.values()]\n",
"\n",
"# create the term document matrix\n",
"D = vectorizer.fit_transform(l)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "51e9ea02",
"metadata": {},
"outputs": [],
"source": [
"# optionally store the column labels (distinct terms in vectorizer model)\n",
"# get_feature_names was deprecated in scikit-learn 1.0 and removed in 1.2;\n",
"# prefer its replacement and fall back for older installs\n",
"if hasattr(vectorizer, 'get_feature_names_out'):\n",
"    words = list(vectorizer.get_feature_names_out())\n",
"else:\n",
"    words = vectorizer.get_feature_names()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fda6f2f9",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"# Get matrix W above -- one row per document, one column per topic\n",
"W = model.fit_transform(D)\n",
"\n",
"# Get matrix T above -- one row per topic, one column per unique word\n",
"T = model.components_"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aaca2b23",
"metadata": {},
"outputs": [],
"source": [
"# rename D to `documents_by_terms`\n",
"documents_by_terms = D\n",
"\n",
"# rename W to `documents_by_topics`\n",
"documents_by_topics = W\n",
"\n",
"# rename T to `topics_by_terms`\n",
"topics_by_terms = T"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "64387ee3",
"metadata": {},
"outputs": [],
"source": [
"np.save(os.path.join('data', 'npy', 'documents-by-topics'), documents_by_topics)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "821b3499",
"metadata": {},
"outputs": [],
"source": [
"from umap import UMAP\n",
"\n",
"z = UMAP(n_neighbors=8).fit_transform(documents_by_topics)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0e50c968",
"metadata": {},
"outputs": [],
"source": [
"colors = UMAP(n_neighbors=8, n_components=1).fit_transform(documents_by_topics)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ff1f3c8d",
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import numpy as np\n",
"\n",
"def scale(a):\n",
"    '''Min-max scale the array a into [0, 1] using its overall minimum and maximum\n",
"\n",
"    A constant array has zero range; it is mapped to all zeros instead of\n",
"    dividing by zero, which would put NaN values into the exported JSON.\n",
"    '''\n",
"    lo, hi = np.min(a), np.max(a)\n",
"    if hi == lo:\n",
"        return np.zeros_like(a, dtype=float)\n",
"    return (a - lo) / (hi - lo)\n",
"\n",
"# positions are spread over [-1, 1], colors over [0, 1]\n",
"scaled_positions = (scale(z) - 0.5) * 2.0\n",
"scaled_colors = scale(colors)\n",
"\n",
"with open(os.path.join('data', 'json', 'positions.json'), 'w') as out:\n",
"    json.dump({\n",
"        'positions': scaled_positions.tolist(),\n",
"        'colors': scaled_colors.squeeze().tolist(),\n",
"    }, out)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aa809b93",
"metadata": {},
"outputs": [],
"source": [
"np.max(scaled_positions)"
]
},
{
"cell_type": "markdown",
"id": "2b7563b1",
"metadata": {},
"source": [
"# Unused: Page View Counts"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8fe12f3d",
"metadata": {},
"outputs": [],
"source": [
"# from https://dumps.wikimedia.org/other/pagecounts-raw/2016/2016-01/\n",
"links = '''\n",
"pagecounts-20160101-000000.gz, size 66M\n",
"pagecounts-20160101-010000.gz, size 77M\n",
"pagecounts-20160101-020000.gz, size 72M\n",
"pagecounts-20160101-030000.gz, size 71M\n",
"pagecounts-20160101-040000.gz, size 69M\n",
"pagecounts-20160101-050000.gz, size 68M\n",
"pagecounts-20160101-060000.gz, size 69M\n",
"pagecounts-20160101-070000.gz, size 71M\n",
"pagecounts-20160101-080000.gz, size 73M\n",
"pagecounts-20160101-090000.gz, size 77M\n",
"pagecounts-20160101-100000.gz, size 83M\n",
"pagecounts-20160101-110000.gz, size 85M\n",
"pagecounts-20160101-120000.gz, size 85M\n",
"pagecounts-20160101-130000.gz, size 86M\n",
"pagecounts-20160101-140000.gz, size 84M\n",
"pagecounts-20160101-150000.gz, size 86M\n",
"pagecounts-20160101-160000.gz, size 89M\n",
"pagecounts-20160101-170000.gz, size 89M\n",
"pagecounts-20160101-180000.gz, size 93M\n",
"pagecounts-20160101-190000.gz, size 95M\n",
"pagecounts-20160101-200000.gz, size 92M\n",
"pagecounts-20160101-210000.gz, size 91M\n",
"pagecounts-20160101-220000.gz, size 92M\n",
"pagecounts-20160101-230000.gz, size 83M\n",
"pagecounts-20160102-000000.gz, size 81M\n",
"pagecounts-20160102-010000.gz, size 83M\n",
"pagecounts-20160102-020000.gz, size 76M\n",
"pagecounts-20160102-030000.gz, size 76M\n",
"pagecounts-20160102-040000.gz, size 75M\n",
"pagecounts-20160102-050000.gz, size 73M\n",
"pagecounts-20160102-060000.gz, size 72M\n",
"pagecounts-20160102-070000.gz, size 71M\n",
"pagecounts-20160102-080000.gz, size 75M\n",
"pagecounts-20160102-090000.gz, size 79M\n",
"pagecounts-20160102-100000.gz, size 84M\n",
"pagecounts-20160102-110000.gz, size 90M\n",
"pagecounts-20160102-120000.gz, size 90M\n",
"pagecounts-20160102-130000.gz, size 92M\n",
"pagecounts-20160102-140000.gz, size 95M\n",
"pagecounts-20160102-150000.gz, size 93M\n",
"pagecounts-20160102-160000.gz, size 94M\n",
"pagecounts-20160102-170000.gz, size 96M\n",
"pagecounts-20160102-180000.gz, size 95M\n",
"pagecounts-20160102-190000.gz, size 92M\n",
"pagecounts-20160102-200000.gz, size 92M\n",
"pagecounts-20160102-210000.gz, size 89M\n",
"pagecounts-20160102-220000.gz, size 87M\n",
"pagecounts-20160102-230000.gz, size 86M\n",
"pagecounts-20160103-000000.gz, size 82M\n",
"pagecounts-20160103-010000.gz, size 85M\n",
"pagecounts-20160103-020000.gz, size 80M\n",
"pagecounts-20160103-030000.gz, size 76M\n",
"pagecounts-20160103-040000.gz, size 75M\n",
"pagecounts-20160103-050000.gz, size 75M\n",
"pagecounts-20160103-060000.gz, size 77M\n",
"pagecounts-20160103-070000.gz, size 76M\n",
"pagecounts-20160103-080000.gz, size 80M\n",
"pagecounts-20160103-090000.gz, size 82M\n",
"pagecounts-20160103-100000.gz, size 88M\n",
"pagecounts-20160103-110000.gz, size 90M\n",
"pagecounts-20160103-120000.gz, size 92M\n",
"pagecounts-20160103-130000.gz, size 94M\n",
"pagecounts-20160103-140000.gz, size 94M\n",
"pagecounts-20160103-150000.gz, size 95M\n",
"pagecounts-20160103-160000.gz, size 93M\n",
"pagecounts-20160103-170000.gz, size 93M\n",
"pagecounts-20160103-180000.gz, size 94M\n",
"pagecounts-20160103-190000.gz, size 92M\n",
"pagecounts-20160103-200000.gz, size 91M\n",
"pagecounts-20160103-210000.gz, size 90M\n",
"pagecounts-20160103-220000.gz, size 90M\n",
"pagecounts-20160103-230000.gz, size 86M\n",
"pagecounts-20160104-000000.gz, size 80M\n",
"pagecounts-20160104-010000.gz, size 83M\n",
"pagecounts-20160104-020000.gz, size 83M\n",
"pagecounts-20160104-030000.gz, size 81M\n",
"pagecounts-20160104-040000.gz, size 79M\n",
"pagecounts-20160104-050000.gz, size 79M\n",
"pagecounts-20160104-060000.gz, size 79M\n",
"pagecounts-20160104-070000.gz, size 81M\n",
"pagecounts-20160104-080000.gz, size 84M\n",
"pagecounts-20160104-090000.gz, size 89M\n",
"pagecounts-20160104-100000.gz, size 94M\n",
"pagecounts-20160104-110000.gz, size 96M\n",
"pagecounts-20160104-120000.gz, size 92M\n",
"pagecounts-20160104-130000.gz, size 93M\n",
"pagecounts-20160104-140000.gz, size 98M\n",
"pagecounts-20160104-150000.gz, size 101M\n",
"pagecounts-20160104-160000.gz, size 102M\n",
"pagecounts-20160104-170000.gz, size 98M\n",
"pagecounts-20160104-180000.gz, size 98M\n",
"pagecounts-20160104-190000.gz, size 98M\n",
"pagecounts-20160104-200000.gz, size 96M\n",
"pagecounts-20160104-210000.gz, size 96M\n",
"pagecounts-20160104-220000.gz, size 96M\n",
"pagecounts-20160104-230000.gz, size 93M\n",
"pagecounts-20160105-000000.gz, size 88M\n",
"pagecounts-20160105-010000.gz, size 90M\n",
"pagecounts-20160105-020000.gz, size 84M\n",
"pagecounts-20160105-030000.gz, size 81M\n",
"pagecounts-20160105-040000.gz, size 81M\n",
"pagecounts-20160105-050000.gz, size 82M\n",
"pagecounts-20160105-060000.gz, size 85M\n",
"pagecounts-20160105-070000.gz, size 85M\n",
"pagecounts-20160105-080000.gz, size 90M\n",
"pagecounts-20160105-090000.gz, size 100M\n",
"pagecounts-20160105-100000.gz, size 103M\n",
"pagecounts-20160105-110000.gz, size 105M\n",
"pagecounts-20160105-120000.gz, size 105M\n",
"pagecounts-20160105-130000.gz, size 104M\n",
"pagecounts-20160105-140000.gz, size 106M\n",
"pagecounts-20160105-150000.gz, size 107M\n",
"pagecounts-20160105-160000.gz, size 106M\n",
"pagecounts-20160105-170000.gz, size 106M\n",
"pagecounts-20160105-180000.gz, size 105M\n",
"pagecounts-20160105-190000.gz, size 110M\n",
"pagecounts-20160105-200000.gz, size 112M\n",
"pagecounts-20160105-210000.gz, size 107M\n",
"pagecounts-20160105-220000.gz, size 110M\n",
"pagecounts-20160105-230000.gz, size 112M\n",
"pagecounts-20160106-000000.gz, size 102M\n",
"pagecounts-20160106-010000.gz, size 102M\n",
"pagecounts-20160106-020000.gz, size 93M\n",
"pagecounts-20160106-030000.gz, size 88M\n",
"pagecounts-20160106-040000.gz, size 87M\n",
"pagecounts-20160106-050000.gz, size 84M\n",
"pagecounts-20160106-060000.gz, size 90M\n",
"pagecounts-20160106-070000.gz, size 94M\n",
"pagecounts-20160106-080000.gz, size 99M\n",
"pagecounts-20160106-090000.gz, size 102M\n",
"pagecounts-20160106-100000.gz, size 104M\n",
"pagecounts-20160106-110000.gz, size 105M\n",
"pagecounts-20160106-120000.gz, size 104M\n",
"pagecounts-20160106-130000.gz, size 101M\n",
"pagecounts-20160106-140000.gz, size 104M\n",
"pagecounts-20160106-150000.gz, size 104M\n",
"pagecounts-20160106-160000.gz, size 103M\n",
"pagecounts-20160106-170000.gz, size 101M\n",
"pagecounts-20160106-180000.gz, size 101M\n",
"pagecounts-20160106-190000.gz, size 107M\n",
"pagecounts-20160106-200000.gz, size 105M\n",
"pagecounts-20160106-210000.gz, size 101M\n",
"pagecounts-20160106-220000.gz, size 103M\n",
"pagecounts-20160106-230000.gz, size 101M\n",
"pagecounts-20160107-000000.gz, size 92M\n",
"pagecounts-20160107-010000.gz, size 92M\n",
"pagecounts-20160107-020000.gz, size 90M\n",
"pagecounts-20160107-030000.gz, size 86M\n",
"pagecounts-20160107-040000.gz, size 85M\n",
"pagecounts-20160107-050000.gz, size 83M\n",
"pagecounts-20160107-060000.gz, size 83M\n",
"pagecounts-20160107-070000.gz, size 83M\n",
"pagecounts-20160107-080000.gz, size 90M\n",
"pagecounts-20160107-090000.gz, size 96M\n",
"pagecounts-20160107-100000.gz, size 99M\n",
"pagecounts-20160107-110000.gz, size 104M\n",
"pagecounts-20160107-120000.gz, size 103M\n",
"pagecounts-20160107-130000.gz, size 103M\n",
"pagecounts-20160107-140000.gz, size 104M\n",
"pagecounts-20160107-150000.gz, size 105M\n",
"pagecounts-20160107-160000.gz, size 108M\n",
"pagecounts-20160107-170000.gz, size 107M\n",
"pagecounts-20160107-180000.gz, size 106M\n",
"pagecounts-20160107-190000.gz, size 108M\n",
"pagecounts-20160107-200000.gz, size 106M\n",
"pagecounts-20160107-210000.gz, size 101M\n",
"pagecounts-20160107-220000.gz, size 104M\n",
"pagecounts-20160107-230000.gz, size 97M\n",
"pagecounts-20160108-000000.gz, size 91M\n",
"pagecounts-20160108-010000.gz, size 96M\n",
"pagecounts-20160108-020000.gz, size 90M\n",
"pagecounts-20160108-030000.gz, size 86M\n",
"pagecounts-20160108-040000.gz, size 86M\n",
"pagecounts-20160108-050000.gz, size 85M\n",
"pagecounts-20160108-060000.gz, size 89M\n",
"pagecounts-20160108-070000.gz, size 89M\n",
"pagecounts-20160108-080000.gz, size 96M\n",
"pagecounts-20160108-090000.gz, size 99M\n",
"pagecounts-20160108-100000.gz, size 100M\n",
"pagecounts-20160108-110000.gz, size 98M\n",
"pagecounts-20160108-120000.gz, size 97M\n",
"pagecounts-20160108-130000.gz, size 96M\n",
"pagecounts-20160108-140000.gz, size 100M\n",
"pagecounts-20160108-150000.gz, size 104M\n",
"pagecounts-20160108-160000.gz, size 103M\n",
"pagecounts-20160108-170000.gz, size 106M\n",
"pagecounts-20160108-180000.gz, size 103M\n",
"pagecounts-20160108-190000.gz, size 103M\n",
"pagecounts-20160108-200000.gz, size 102M\n",
"pagecounts-20160108-210000.gz, size 97M\n",
"pagecounts-20160108-220000.gz, size 95M\n",
"pagecounts-20160108-230000.gz, size 96M\n",
"pagecounts-20160109-000000.gz, size 89M\n",
"pagecounts-20160109-010000.gz, size 92M\n",
"pagecounts-20160109-020000.gz, size 86M\n",
"pagecounts-20160109-030000.gz, size 79M\n",
"pagecounts-20160109-040000.gz, size 75M\n",
"pagecounts-20160109-050000.gz, size 77M\n",
"pagecounts-20160109-060000.gz, size 75M\n",
"pagecounts-20160109-070000.gz, size 77M\n",
"pagecounts-20160109-080000.gz, size 79M\n",
"pagecounts-20160109-090000.gz, size 86M\n",
"pagecounts-20160109-100000.gz, size 92M\n",
"pagecounts-20160109-110000.gz, size 96M\n",
"pagecounts-20160109-120000.gz, size 98M\n",
"pagecounts-20160109-130000.gz, size 94M\n",
"pagecounts-20160109-140000.gz, size 96M\n",
"pagecounts-20160109-150000.gz, size 96M\n",
"pagecounts-20160109-160000.gz, size 98M\n",
"pagecounts-20160109-170000.gz, size 98M\n",
"pagecounts-20160109-180000.gz, size 99M\n",
"pagecounts-20160109-190000.gz, size 98M\n",
"pagecounts-20160109-200000.gz, size 98M\n",
"pagecounts-20160109-210000.gz, size 95M\n",
"pagecounts-20160109-220000.gz, size 93M\n",
"pagecounts-20160109-230000.gz, size 91M\n",
"pagecounts-20160110-000000.gz, size 87M\n",
"pagecounts-20160110-010000.gz, size 86M\n",
"pagecounts-20160110-020000.gz, size 84M\n",
"pagecounts-20160110-030000.gz, size 79M\n",
"pagecounts-20160110-040000.gz, size 78M\n",
"pagecounts-20160110-050000.gz, size 78M\n",
"pagecounts-20160110-060000.gz, size 76M\n",
"pagecounts-20160110-070000.gz, size 81M\n",
"pagecounts-20160110-080000.gz, size 82M\n",
"pagecounts-20160110-090000.gz, size 85M\n",
"pagecounts-20160110-100000.gz, size 91M\n",
"pagecounts-20160110-110000.gz, size 93M\n",
"pagecounts-20160110-120000.gz, size 93M\n",
"pagecounts-20160110-130000.gz, size 92M\n",
"pagecounts-20160110-140000.gz, size 93M\n",
"pagecounts-20160110-150000.gz, size 96M\n",
"pagecounts-20160110-160000.gz, size 98M\n",
"pagecounts-20160110-170000.gz, size 97M\n",
"pagecounts-20160110-180000.gz, size 97M\n",
"pagecounts-20160110-190000.gz, size 99M\n",
"pagecounts-20160110-200000.gz, size 99M\n",
"pagecounts-20160110-210000.gz, size 93M\n",
"pagecounts-20160110-220000.gz, size 91M\n",
"pagecounts-20160110-230000.gz, size 90M\n",
"pagecounts-20160111-000000.gz, size 82M\n",
"pagecounts-20160111-010000.gz, size 83M\n",
"pagecounts-20160111-020000.gz, size 79M\n",
"pagecounts-20160111-030000.gz, size 80M\n",
"pagecounts-20160111-040000.gz, size 82M\n",
"pagecounts-20160111-050000.gz, size 83M\n",
"pagecounts-20160111-060000.gz, size 82M\n",
"pagecounts-20160111-070000.gz, size 86M\n",
"pagecounts-20160111-080000.gz, size 90M\n",
"pagecounts-20160111-090000.gz, size 95M\n",
"pagecounts-20160111-100000.gz, size 102M\n",
"pagecounts-20160111-110000.gz, size 96M\n",
"pagecounts-20160111-120000.gz, size 100M\n",
"pagecounts-20160111-130000.gz, size 98M\n",
"pagecounts-20160111-140000.gz, size 101M\n",
"pagecounts-20160111-150000.gz, size 102M\n",
"pagecounts-20160111-160000.gz, size 102M\n",
"pagecounts-20160111-170000.gz, size 101M\n",
"pagecounts-20160111-180000.gz, size 99M\n",
"pagecounts-20160111-190000.gz, size 99M\n",
"pagecounts-20160111-200000.gz, size 98M\n",
"pagecounts-20160111-210000.gz, size 99M\n",
"pagecounts-20160111-220000.gz, size 95M\n",
"pagecounts-20160111-230000.gz, size 91M\n",
"pagecounts-20160112-000000.gz, size 91M\n",
"pagecounts-20160112-010000.gz, size 91M\n",
"pagecounts-20160112-020000.gz, size 84M\n",
"pagecounts-20160112-030000.gz, size 80M\n",
"pagecounts-20160112-040000.gz, size 81M\n",
"pagecounts-20160112-050000.gz, size 82M\n",
"pagecounts-20160112-060000.gz, size 81M\n",
"pagecounts-20160112-070000.gz, size 83M\n",
"pagecounts-20160112-080000.gz, size 90M\n",
"pagecounts-20160112-090000.gz, size 93M\n",
"pagecounts-20160112-100000.gz, size 96M\n",
"pagecounts-20160112-110000.gz, size 99M\n",
"pagecounts-20160112-120000.gz, size 97M\n",
"pagecounts-20160112-130000.gz, size 95M\n",
"pagecounts-20160112-140000.gz, size 96M\n",
"pagecounts-20160112-150000.gz, size 101M\n",
"pagecounts-20160112-160000.gz, size 99M\n",
"pagecounts-20160112-170000.gz, size 97M\n",
"pagecounts-20160112-180000.gz, size 99M\n",
"pagecounts-20160112-190000.gz, size 97M\n",
"pagecounts-20160112-200000.gz, size 98M\n",
"pagecounts-20160112-210000.gz, size 95M\n",
"pagecounts-20160112-220000.gz, size 97M\n",
"pagecounts-20160112-230000.gz, size 90M\n",
"pagecounts-20160113-000000.gz, size 84M\n",
"pagecounts-20160113-010000.gz, size 86M\n",
"pagecounts-20160113-020000.gz, size 85M\n",
"pagecounts-20160113-030000.gz, size 84M\n",
"pagecounts-20160113-040000.gz, size 81M\n",
"pagecounts-20160113-050000.gz, size 80M\n",
"pagecounts-20160113-060000.gz, size 71M\n",
"pagecounts-20160113-070000.gz, size 77M\n",
"pagecounts-20160113-080000.gz, size 81M\n",
"pagecounts-20160113-090000.gz, size 85M\n",
"pagecounts-20160113-100000.gz, size 90M\n",
"pagecounts-20160113-110000.gz, size 92M\n",
"pagecounts-20160113-120000.gz, size 91M\n",
"pagecounts-20160113-130000.gz, size 92M\n",
"pagecounts-20160113-140000.gz, size 100M\n",
"pagecounts-20160113-150000.gz, size 103M\n",
"pagecounts-20160113-160000.gz, size 103M\n",
"pagecounts-20160113-170000.gz, size 101M\n",
"pagecounts-20160113-180000.gz, size 101M\n",
"pagecounts-20160113-190000.gz, size 101M\n",
"pagecounts-20160113-200000.gz, size 99M\n",
"pagecounts-20160113-210000.gz, size 98M\n",
"pagecounts-20160113-220000.gz, size 97M\n",
"pagecounts-20160113-230000.gz, size 93M\n",
"pagecounts-20160114-000000.gz, size 83M\n",
"pagecounts-20160114-010000.gz, size 87M\n",
"pagecounts-20160114-020000.gz, size 85M\n",
"pagecounts-20160114-030000.gz, size 79M\n",
"pagecounts-20160114-040000.gz, size 80M\n",
"pagecounts-20160114-050000.gz, size 82M\n",
"pagecounts-20160114-060000.gz, size 86M\n",
"pagecounts-20160114-070000.gz, size 85M\n",
"pagecounts-20160114-080000.gz, size 92M\n",
"pagecounts-20160114-090000.gz, size 96M\n",
"pagecounts-20160114-100000.gz, size 100M\n",
"pagecounts-20160114-110000.gz, size 101M\n",
"pagecounts-20160114-120000.gz, size 100M\n",
"pagecounts-20160114-130000.gz, size 97M\n",
"pagecounts-20160114-140000.gz, size 102M\n",
"pagecounts-20160114-150000.gz, size 103M\n",
"pagecounts-20160114-160000.gz, size 100M\n",
"pagecounts-20160114-170000.gz, size 100M\n",
"pagecounts-20160114-180000.gz, size 101M\n",
"pagecounts-20160114-190000.gz, size 102M\n",
"pagecounts-20160114-200000.gz, size 102M\n",
"pagecounts-20160114-210000.gz, size 102M\n",
"pagecounts-20160114-220000.gz, size 98M\n",
"pagecounts-20160114-230000.gz, size 92M\n",
"pagecounts-20160115-000000.gz, size 86M\n",
"pagecounts-20160115-010000.gz, size 88M\n",
"pagecounts-20160115-020000.gz, size 86M\n",
"pagecounts-20160115-030000.gz, size 82M\n",
"pagecounts-20160115-040000.gz, size 81M\n",
"pagecounts-20160115-050000.gz, size 80M\n",
"pagecounts-20160115-060000.gz, size 80M\n",
"pagecounts-20160115-070000.gz, size 80M\n",
"pagecounts-20160115-080000.gz, size 88M\n",
"pagecounts-20160115-090000.gz, size 93M\n",
"pagecounts-20160115-100000.gz, size 94M\n",
"pagecounts-20160115-110000.gz, size 92M\n",
"pagecounts-20160115-120000.gz, size 92M\n",
"pagecounts-20160115-130000.gz, size 90M\n",
"pagecounts-20160115-140000.gz, size 94M\n",
"pagecounts-20160115-150000.gz, size 99M\n",
"pagecounts-20160115-160000.gz, size 99M\n",
"pagecounts-20160115-170000.gz, size 98M\n",
"pagecounts-20160115-180000.gz, size 95M\n",
"pagecounts-20160115-190000.gz, size 95M\n",
"pagecounts-20160115-200000.gz, size 95M\n",
"pagecounts-20160115-210000.gz, size 92M\n",
"pagecounts-20160115-220000.gz, size 89M\n",
"pagecounts-20160115-230000.gz, size 88M\n",
"pagecounts-20160116-000000.gz, size 79M\n",
"pagecounts-20160116-010000.gz, size 79M\n",
"pagecounts-20160116-020000.gz, size 77M\n",
"pagecounts-20160116-030000.gz, size 75M\n",
"pagecounts-20160116-040000.gz, size 77M\n",
"pagecounts-20160116-050000.gz, size 79M\n",
"pagecounts-20160116-060000.gz, size 81M\n",
"pagecounts-20160116-070000.gz, size 82M\n",
"pagecounts-20160116-080000.gz, size 82M\n",
"pagecounts-20160116-090000.gz, size 84M\n",
"pagecounts-20160116-100000.gz, size 90M\n",
"pagecounts-20160116-110000.gz, size 91M\n",
"pagecounts-20160116-120000.gz, size 89M\n",
"pagecounts-20160116-130000.gz, size 89M\n",
"pagecounts-20160116-140000.gz, size 92M\n",
"pagecounts-20160116-150000.gz, size 93M\n",
"pagecounts-20160116-160000.gz, size 95M\n",
"pagecounts-20160116-170000.gz, size 92M\n",
"pagecounts-20160116-180000.gz, size 94M\n",
"pagecounts-20160116-190000.gz, size 92M\n",
"pagecounts-20160116-200000.gz, size 89M\n",
"pagecounts-20160116-210000.gz, size 90M\n",
"pagecounts-20160116-220000.gz, size 89M\n",
"pagecounts-20160116-230000.gz, size 89M\n",
"pagecounts-20160117-000000.gz, size 83M\n",
"pagecounts-20160117-010000.gz, size 83M\n",
"pagecounts-20160117-020000.gz, size 81M\n",
"pagecounts-20160117-030000.gz, size 77M\n",
"pagecounts-20160117-040000.gz, size 74M\n",
"pagecounts-20160117-050000.gz, size 77M\n",
"pagecounts-20160117-060000.gz, size 76M\n",
"pagecounts-20160117-070000.gz, size 78M\n",
"pagecounts-20160117-080000.gz, size 83M\n",
"pagecounts-20160117-090000.gz, size 88M\n",
"pagecounts-20160117-100000.gz, size 91M\n",
"pagecounts-20160117-110000.gz, size 94M\n",
"pagecounts-20160117-120000.gz, size 93M\n",
"pagecounts-20160117-130000.gz, size 89M\n",
"pagecounts-20160117-140000.gz, size 95M\n",
"pagecounts-20160117-150000.gz, size 100M\n",
"pagecounts-20160117-160000.gz, size 99M\n",
"pagecounts-20160117-170000.gz, size 94M\n",
"pagecounts-20160117-180000.gz, size 95M\n",
"pagecounts-20160117-190000.gz, size 91M\n",
"pagecounts-20160117-200000.gz, size 91M\n",
"pagecounts-20160117-210000.gz, size 88M\n",
"pagecounts-20160117-220000.gz, size 86M\n",
"pagecounts-20160117-230000.gz, size 86M\n",
"pagecounts-20160118-000000.gz, size 85M\n",
"pagecounts-20160118-010000.gz, size 88M\n",
"pagecounts-20160118-020000.gz, size 85M\n",
"pagecounts-20160118-030000.gz, size 81M\n",
"pagecounts-20160118-040000.gz, size 83M\n",
"pagecounts-20160118-050000.gz, size 83M\n",
"pagecounts-20160118-060000.gz, size 84M\n",
"pagecounts-20160118-070000.gz, size 83M\n",
"pagecounts-20160118-080000.gz, size 89M\n",
"pagecounts-20160118-090000.gz, size 96M\n",
"pagecounts-20160118-100000.gz, size 101M\n",
"pagecounts-20160118-110000.gz, size 100M\n",
"pagecounts-20160118-120000.gz, size 98M\n",
"pagecounts-20160118-130000.gz, size 95M\n",
"pagecounts-20160118-140000.gz, size 100M\n",
"pagecounts-20160118-150000.gz, size 104M\n",
"pagecounts-20160118-160000.gz, size 104M\n",
"pagecounts-20160118-170000.gz, size 103M\n",
"pagecounts-20160118-180000.gz, size 103M\n",
"pagecounts-20160118-190000.gz, size 102M\n",
"pagecounts-20160118-200000.gz, size 99M\n",
"pagecounts-20160118-210000.gz, size 98M\n",
"pagecounts-20160118-220000.gz, size 96M\n",
"pagecounts-20160118-230000.gz, size 90M\n",
"pagecounts-20160119-000000.gz, size 88M\n",
"pagecounts-20160119-010000.gz, size 89M\n",
"pagecounts-20160119-020000.gz, size 86M\n",
"pagecounts-20160119-030000.gz, size 81M\n",
"pagecounts-20160119-040000.gz, size 80M\n",
"pagecounts-20160119-050000.gz, size 81M\n",
"pagecounts-20160119-060000.gz, size 85M\n",
"pagecounts-20160119-070000.gz, size 87M\n",
"pagecounts-20160119-080000.gz, size 94M\n",
"pagecounts-20160119-090000.gz, size 97M\n",
"pagecounts-20160119-100000.gz, size 102M\n",
"pagecounts-20160119-110000.gz, size 100M\n",
"pagecounts-20160119-120000.gz, size 98M\n",
"pagecounts-20160119-130000.gz, size 97M\n",
"pagecounts-20160119-140000.gz, size 101M\n",
"pagecounts-20160119-150000.gz, size 102M\n",
"pagecounts-20160119-160000.gz, size 101M\n",
"pagecounts-20160119-170000.gz, size 97M\n",
"pagecounts-20160119-180000.gz, size 95M\n",
"pagecounts-20160119-190000.gz, size 95M\n",
"pagecounts-20160119-200000.gz, size 95M\n",
"pagecounts-20160119-210000.gz, size 96M\n",
"pagecounts-20160119-220000.gz, size 93M\n",
"pagecounts-20160119-230000.gz, size 89M\n",
"pagecounts-20160120-000000.gz, size 82M\n",
"pagecounts-20160120-010000.gz, size 84M\n",
"pagecounts-20160120-020000.gz, size 83M\n",
"pagecounts-20160120-030000.gz, size 79M\n",
"pagecounts-20160120-040000.gz, size 77M\n",
"pagecounts-20160120-050000.gz, size 76M\n",
"pagecounts-20160120-060000.gz, size 81M\n",
"pagecounts-20160120-070000.gz, size 83M\n",
"pagecounts-20160120-080000.gz, size 88M\n",
"pagecounts-20160120-090000.gz, size 97M\n",
"pagecounts-20160120-100000.gz, size 100M\n",
"pagecounts-20160120-110000.gz, size 103M\n",
"pagecounts-20160120-120000.gz, size 100M\n",
"pagecounts-20160120-130000.gz, size 96M\n",
"pagecounts-20160120-140000.gz, size 97M\n",
"pagecounts-20160120-150000.gz, size 100M\n",
"pagecounts-20160120-160000.gz, size 101M\n",
"pagecounts-20160120-170000.gz, size 102M\n",
"pagecounts-20160120-180000.gz, size 98M\n",
"pagecounts-20160120-190000.gz, size 97M\n",
"pagecounts-20160120-200000.gz, size 96M\n",
"pagecounts-20160120-210000.gz, size 94M\n",
"pagecounts-20160120-220000.gz, size 90M\n",
"pagecounts-20160120-230000.gz, size 84M\n",
"pagecounts-20160121-000000.gz, size 81M\n",
"pagecounts-20160121-010000.gz, size 88M\n",
"pagecounts-20160121-020000.gz, size 83M\n",
"pagecounts-20160121-030000.gz, size 82M\n",
"pagecounts-20160121-040000.gz, size 83M\n",
"pagecounts-20160121-050000.gz, size 81M\n",
"pagecounts-20160121-060000.gz, size 82M\n",
"pagecounts-20160121-070000.gz, size 86M\n",
"pagecounts-20160121-080000.gz, size 90M\n",
"pagecounts-20160121-090000.gz, size 95M\n",
"pagecounts-20160121-100000.gz, size 98M\n",
"pagecounts-20160121-110000.gz, size 101M\n",
"pagecounts-20160121-120000.gz, size 99M\n",
"pagecounts-20160121-130000.gz, size 95M\n",
"pagecounts-20160121-140000.gz, size 98M\n",
"pagecounts-20160121-150000.gz, size 99M\n",
"pagecounts-20160121-160000.gz, size 99M\n",
"pagecounts-20160121-170000.gz, size 98M\n",
"pagecounts-20160121-180000.gz, size 97M\n",
"pagecounts-20160121-190000.gz, size 95M\n",
"pagecounts-20160121-200000.gz, size 92M\n",
"pagecounts-20160121-210000.gz, size 92M\n",
"pagecounts-20160121-220000.gz, size 95M\n",
"pagecounts-20160121-230000.gz, size 91M\n",
"pagecounts-20160122-000000.gz, size 88M\n",
"pagecounts-20160122-010000.gz, size 95M\n",
"pagecounts-20160122-020000.gz, size 89M\n",
"pagecounts-20160122-030000.gz, size 87M\n",
"pagecounts-20160122-040000.gz, size 84M\n",
"pagecounts-20160122-050000.gz, size 82M\n",
"pagecounts-20160122-060000.gz, size 84M\n",
"pagecounts-20160122-070000.gz, size 85M\n",
"pagecounts-20160122-080000.gz, size 90M\n",
"pagecounts-20160122-090000.gz, size 92M\n",
"pagecounts-20160122-100000.gz, size 99M\n",
"pagecounts-20160122-110000.gz, size 99M\n",
"pagecounts-20160122-120000.gz, size 98M\n",
"pagecounts-20160122-130000.gz, size 92M\n",
"pagecounts-20160122-140000.gz, size 97M\n",
"pagecounts-20160122-150000.gz, size 100M\n",
"pagecounts-20160122-160000.gz, size 97M\n",
"pagecounts-20160122-170000.gz, size 95M\n",
"pagecounts-20160122-180000.gz, size 94M\n",
"pagecounts-20160122-190000.gz, size 93M\n",
"pagecounts-20160122-200000.gz, size 89M\n",
"pagecounts-20160122-210000.gz, size 86M\n",
"pagecounts-20160122-220000.gz, size 82M\n",
"pagecounts-20160122-230000.gz, size 82M\n",
"pagecounts-20160123-000000.gz, size 78M\n",
"pagecounts-20160123-010000.gz, size 83M\n",
"pagecounts-20160123-020000.gz, size 84M\n",
"pagecounts-20160123-030000.gz, size 79M\n",
"pagecounts-20160123-040000.gz, size 81M\n",
"pagecounts-20160123-050000.gz, size 83M\n",
"pagecounts-20160123-060000.gz, size 83M\n",
"pagecounts-20160123-070000.gz, size 82M\n",
"pagecounts-20160123-080000.gz, size 86M\n",
"pagecounts-20160123-090000.gz, size 89M\n",
"pagecounts-20160123-100000.gz, size 93M\n",
"pagecounts-20160123-110000.gz, size 93M\n",
"pagecounts-20160123-120000.gz, size 94M\n",
"pagecounts-20160123-130000.gz, size 93M\n",
"pagecounts-20160123-140000.gz, size 95M\n",
"pagecounts-20160123-150000.gz, size 95M\n",
"pagecounts-20160123-160000.gz, size 96M\n",
"pagecounts-20160123-170000.gz, size 99M\n",
"pagecounts-20160123-180000.gz, size 100M\n",
"pagecounts-20160123-190000.gz, size 97M\n",
"pagecounts-20160123-200000.gz, size 95M\n",
"pagecounts-20160123-210000.gz, size 92M\n",
"pagecounts-20160123-220000.gz, size 92M\n",
"pagecounts-20160123-230000.gz, size 91M\n",
"pagecounts-20160124-000000.gz, size 85M\n",
"pagecounts-20160124-010000.gz, size 91M\n",
"pagecounts-20160124-020000.gz, size 83M\n",
"pagecounts-20160124-030000.gz, size 82M\n",
"pagecounts-20160124-040000.gz, size 81M\n",
"pagecounts-20160124-050000.gz, size 86M\n",
"pagecounts-20160124-060000.gz, size 85M\n",
"pagecounts-20160124-070000.gz, size 85M\n",
"pagecounts-20160124-080000.gz, size 86M\n",
"pagecounts-20160124-090000.gz, size 89M\n",
"pagecounts-20160124-100000.gz, size 96M\n",
"pagecounts-20160124-110000.gz, size 96M\n",
"pagecounts-20160124-120000.gz, size 96M\n",
"pagecounts-20160124-130000.gz, size 93M\n",
"pagecounts-20160124-140000.gz, size 98M\n",
"pagecounts-20160124-150000.gz, size 99M\n",
"pagecounts-20160124-160000.gz, size 98M\n",
"pagecounts-20160124-170000.gz, size 96M\n",
"pagecounts-20160124-180000.gz, size 94M\n",
"pagecounts-20160124-190000.gz, size 96M\n",
"pagecounts-20160124-200000.gz, size 96M\n",
"pagecounts-20160124-210000.gz, size 93M\n",
"pagecounts-20160124-220000.gz, size 90M\n",
"pagecounts-20160124-230000.gz, size 87M\n",
"pagecounts-20160125-000000.gz, size 82M\n",
"pagecounts-20160125-010000.gz, size 85M\n",
"pagecounts-20160125-020000.gz, size 83M\n",
"pagecounts-20160125-030000.gz, size 79M\n",
"pagecounts-20160125-040000.gz, size 74M\n",
"pagecounts-20160125-050000.gz, size 73M\n",
"pagecounts-20160125-060000.gz, size 75M\n",
"pagecounts-20160125-070000.gz, size 82M\n",
"pagecounts-20160125-080000.gz, size 89M\n",
"pagecounts-20160125-090000.gz, size 94M\n",
"pagecounts-20160125-100000.gz, size 99M\n",
"pagecounts-20160125-110000.gz, size 97M\n",
"pagecounts-20160125-120000.gz, size 99M\n",
"pagecounts-20160125-130000.gz, size 96M\n",
"pagecounts-20160125-140000.gz, size 101M\n",
"pagecounts-20160125-150000.gz, size 103M\n",
"pagecounts-20160125-160000.gz, size 104M\n",
"pagecounts-20160125-170000.gz, size 102M\n",
"pagecounts-20160125-180000.gz, size 100M\n",
"pagecounts-20160125-190000.gz, size 97M\n",
"pagecounts-20160125-200000.gz, size 97M\n",
"pagecounts-20160125-210000.gz, size 97M\n",
"pagecounts-20160125-220000.gz, size 93M\n",
"pagecounts-20160125-230000.gz, size 89M\n",
"pagecounts-20160126-000000.gz, size 85M\n",
"pagecounts-20160126-010000.gz, size 88M\n",
"pagecounts-20160126-020000.gz, size 86M\n",
"pagecounts-20160126-030000.gz, size 83M\n",
"pagecounts-20160126-040000.gz, size 83M\n",
"pagecounts-20160126-050000.gz, size 82M\n",
"pagecounts-20160126-060000.gz, size 84M\n",
"pagecounts-20160126-070000.gz, size 85M\n",
"pagecounts-20160126-080000.gz, size 90M\n",
"pagecounts-20160126-090000.gz, size 94M\n",
"pagecounts-20160126-100000.gz, size 95M\n",
"pagecounts-20160126-110000.gz, size 95M\n",
"pagecounts-20160126-120000.gz, size 96M\n",
"pagecounts-20160126-130000.gz, size 93M\n",
"pagecounts-20160126-140000.gz, size 98M\n",
"pagecounts-20160126-150000.gz, size 100M\n",
"pagecounts-20160126-160000.gz, size 100M\n",
"pagecounts-20160126-170000.gz, size 99M\n",
"pagecounts-20160126-180000.gz, size 98M\n",
"pagecounts-20160126-190000.gz, size 94M\n",
"pagecounts-20160126-200000.gz, size 96M\n",
"pagecounts-20160126-210000.gz, size 94M\n",
"pagecounts-20160126-220000.gz, size 90M\n",
"pagecounts-20160126-230000.gz, size 88M\n",
"pagecounts-20160127-000000.gz, size 84M\n",
"pagecounts-20160127-010000.gz, size 87M\n",
"pagecounts-20160127-020000.gz, size 81M\n",
"pagecounts-20160127-030000.gz, size 81M\n",
"pagecounts-20160127-040000.gz, size 78M\n",
"pagecounts-20160127-050000.gz, size 79M\n",
"pagecounts-20160127-060000.gz, size 81M\n",
"pagecounts-20160127-070000.gz, size 82M\n",
"pagecounts-20160127-080000.gz, size 86M\n",
"pagecounts-20160127-090000.gz, size 91M\n",
"pagecounts-20160127-100000.gz, size 96M\n",
"pagecounts-20160127-110000.gz, size 96M\n",
"pagecounts-20160127-120000.gz, size 95M\n",
"pagecounts-20160127-130000.gz, size 94M\n",
"pagecounts-20160127-140000.gz, size 95M\n",
"pagecounts-20160127-150000.gz, size 97M\n",
"pagecounts-20160127-160000.gz, size 96M\n",
"pagecounts-20160127-170000.gz, size 96M\n",
"pagecounts-20160127-180000.gz, size 92M\n",
"pagecounts-20160127-190000.gz, size 92M\n",
"pagecounts-20160127-200000.gz, size 90M\n",
"pagecounts-20160127-210000.gz, size 90M\n",
"pagecounts-20160127-220000.gz, size 89M\n",
"pagecounts-20160127-230000.gz, size 86M\n",
"pagecounts-20160128-000000.gz, size 80M\n",
"pagecounts-20160128-010000.gz, size 82M\n",
"pagecounts-20160128-020000.gz, size 82M\n",
"pagecounts-20160128-030000.gz, size 81M\n",
"pagecounts-20160128-040000.gz, size 80M\n",
"pagecounts-20160128-050000.gz, size 78M\n",
"pagecounts-20160128-060000.gz, size 80M\n",
"pagecounts-20160128-070000.gz, size 83M\n",
"pagecounts-20160128-080000.gz, size 88M\n",
"pagecounts-20160128-090000.gz, size 94M\n",
"pagecounts-20160128-100000.gz, size 100M\n",
"pagecounts-20160128-110000.gz, size 100M\n",
"pagecounts-20160128-120000.gz, size 100M\n",
"pagecounts-20160128-130000.gz, size 97M\n",
"pagecounts-20160128-140000.gz, size 96M\n",
"pagecounts-20160128-150000.gz, size 99M\n",
"pagecounts-20160128-160000.gz, size 99M\n",
"pagecounts-20160128-170000.gz, size 97M\n",
"pagecounts-20160128-180000.gz, size 95M\n",
"pagecounts-20160128-190000.gz, size 96M\n",
"pagecounts-20160128-200000.gz, size 96M\n",
"pagecounts-20160128-210000.gz, size 96M\n",
"pagecounts-20160128-220000.gz, size 94M\n",
"pagecounts-20160128-230000.gz, size 93M\n",
"pagecounts-20160129-000000.gz, size 87M\n",
"pagecounts-20160129-010000.gz, size 90M\n",
"pagecounts-20160129-020000.gz, size 87M\n",
"pagecounts-20160129-030000.gz, size 86M\n",
"pagecounts-20160129-040000.gz, size 84M\n",
"pagecounts-20160129-050000.gz, size 85M\n",
"pagecounts-20160129-060000.gz, size 83M\n",
"pagecounts-20160129-070000.gz, size 87M\n",
"pagecounts-20160129-080000.gz, size 91M\n",
"pagecounts-20160129-090000.gz, size 94M\n",
"pagecounts-20160129-100000.gz, size 99M\n",
"pagecounts-20160129-110000.gz, size 96M\n",
"pagecounts-20160129-120000.gz, size 97M\n",
"pagecounts-20160129-130000.gz, size 96M\n",
"pagecounts-20160129-140000.gz, size 100M\n",
"pagecounts-20160129-150000.gz, size 98M\n",
"pagecounts-20160129-160000.gz, size 97M\n",
"pagecounts-20160129-170000.gz, size 97M\n",
"pagecounts-20160129-180000.gz, size 94M\n",
"pagecounts-20160129-190000.gz, size 97M\n",
"pagecounts-20160129-200000.gz, size 97M\n",
"pagecounts-20160129-210000.gz, size 95M\n",
"pagecounts-20160129-220000.gz, size 89M\n",
"pagecounts-20160129-230000.gz, size 83M\n",
"pagecounts-20160130-000000.gz, size 79M\n",
"pagecounts-20160130-010000.gz, size 81M\n",
"pagecounts-20160130-020000.gz, size 79M\n",
"pagecounts-20160130-030000.gz, size 72M\n",
"pagecounts-20160130-040000.gz, size 70M\n",
"pagecounts-20160130-050000.gz, size 71M\n",
"pagecounts-20160130-060000.gz, size 72M\n",
"pagecounts-20160130-070000.gz, size 73M\n",
"pagecounts-20160130-080000.gz, size 77M\n",
"pagecounts-20160130-090000.gz, size 81M\n",
"pagecounts-20160130-100000.gz, size 87M\n",
"pagecounts-20160130-110000.gz, size 90M\n",
"pagecounts-20160130-120000.gz, size 95M\n",
"pagecounts-20160130-130000.gz, size 94M\n",
"pagecounts-20160130-140000.gz, size 94M\n",
"pagecounts-20160130-150000.gz, size 96M\n",
"pagecounts-20160130-160000.gz, size 99M\n",
"pagecounts-20160130-170000.gz, size 96M\n",
"pagecounts-20160130-180000.gz, size 92M\n",
"pagecounts-20160130-190000.gz, size 89M\n",
"pagecounts-20160130-200000.gz, size 89M\n",
"pagecounts-20160130-210000.gz, size 91M\n",
"pagecounts-20160130-220000.gz, size 88M\n",
"pagecounts-20160130-230000.gz, size 85M\n",
"pagecounts-20160131-000000.gz, size 83M\n",
"pagecounts-20160131-010000.gz, size 87M\n",
"pagecounts-20160131-020000.gz, size 86M\n",
"pagecounts-20160131-030000.gz, size 84M\n",
"pagecounts-20160131-040000.gz, size 81M\n",
"pagecounts-20160131-050000.gz, size 79M\n",
"pagecounts-20160131-060000.gz, size 83M\n",
"pagecounts-20160131-070000.gz, size 79M\n",
"pagecounts-20160131-080000.gz, size 81M\n",
"pagecounts-20160131-090000.gz, size 88M\n",
"pagecounts-20160131-100000.gz, size 93M\n",
"pagecounts-20160131-110000.gz, size 98M\n",
"pagecounts-20160131-120000.gz, size 98M\n",
"pagecounts-20160131-130000.gz, size 97M\n",
"pagecounts-20160131-140000.gz, size 98M\n",
"pagecounts-20160131-150000.gz, size 99M\n",
"pagecounts-20160131-160000.gz, size 102M\n",
"pagecounts-20160131-170000.gz, size 107M\n",
"pagecounts-20160131-180000.gz, size 106M\n",
"pagecounts-20160131-190000.gz, size 104M\n",
"pagecounts-20160131-200000.gz, size 101M\n",
"pagecounts-20160131-210000.gz, size 98M\n",
"pagecounts-20160131-220000.gz, size 92M\n",
"pagecounts-20160131-230000.gz, size 89M\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cfae8e3a",
"metadata": {},
"outputs": [],
"source": [
"import requests, os\n",
"\n",
"def download_pagecounts():\n",
" out_dir = os.path.join('data', 'pagecounts')\n",
" if not os.path.exists(out_dir): os.makedirs(out_dir)\n",
" for i in links.split('\\n'):\n",
" if i.strip():\n",
" filename = i.split()[0].rstrip(',')\n",
" url = 'https://dumps.wikimedia.org/other/pagecounts-raw/2016/2016-01/{}'.format(filename)\n",
" open(os.path.join(out_dir, filename), 'wb').write(requests.get(url, allow_redirects=True).content)\n",
" for i in glob.glob(os.path.join('data', 'pagecounts', '*')):\n",
" os.system('gunzip ' + i)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d6e4dd76",
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"import hashlib\n",
"import shutil\n",
"import glob\n",
"import json\n",
"import html\n",
"import time\n",
"import wget\n",
"import os\n",
"\n",
"def get_md5(s):\n",
" s = '_'.join(s.split())\n",
" s = html.unescape(s)\n",
" s = format_page_name(s)\n",
" m = hashlib.md5()\n",
" m.update(s.encode('utf8'))\n",
" return str(m.hexdigest())\n",
"\n",
"# columns represent: language/project, title, views, content size\n",
"for i in dataset: dataset[i]['views'] = 0\n",
" \n",
"for day in range(1, 32, 1):\n",
" day = str(day)\n",
" if len(day) < 2: day = '0' + day\n",
" print(' * processing day', day)\n",
" files = glob.glob(os.path.join('data', 'pagecounts', 'pagecounts-201601{}*'.format(day)))\n",
" for idx, i in enumerate(files):\n",
" with open(i) as f:\n",
" for j in f:\n",
" if not j.strip(): continue\n",
" try:\n",
" lang, name, views, size = j.split()\n",
" except ValueError:\n",
" continue\n",
" views = int(views)\n",
" if lang == 'en' and name in dataset:\n",
" dataset[name]['views'] += views"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f6e25562",
"metadata": {},
"outputs": [],
"source": [
"with open(os.path.join('data', 'json', 'pagecounts.json'), 'w') as out:\n",
" json.dump(d, out)"
]
},
{
"cell_type": "markdown",
"id": "05d4ee04",
"metadata": {},
"source": [
"# Unused: Collect all page images"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "df9593bb",
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"import hashlib\n",
"import shutil\n",
"import glob\n",
"import json\n",
"import time\n",
"import wget\n",
"import os\n",
"\n",
"image_urls = {}\n",
"\n",
"def get_md5(s):\n",
" m = hashlib.md5()\n",
" m.update(s.encode('utf8'))\n",
" return str(m.hexdigest())\n",
"\n",
"def get_image_urls(filename):\n",
" filename = filename.replace('.svg', '.png')\n",
" filename = '_'.join(filename.split())\n",
" filename = html.unescape(filename)\n",
" hashed = get_md5(filename)\n",
" fullsize_url = 'https://upload.wikimedia.org/wikipedia/commons/{}/{}/{}'.format(\n",
" hashed[0],\n",
" hashed[:2],\n",
" filename\n",
" )\n",
" thumb_url = 'https://upload.wikimedia.org/wikipedia/commons/thumb/{}/{}/{}/200px-{}'.format(\n",
" hashed[0],\n",
" hashed[:2],\n",
" filename,\n",
" filename\n",
" )\n",
" return {\n",
" 'full': fullsize_url,\n",
" 'thumb': thumb_url,\n",
" }\n",
"\n",
"def get_selected_image_urls():\n",
" image_urls = {}\n",
" for idx, i in enumerate(sorted(d.keys(), reverse=True)):\n",
" print(' * processing bucket', i)\n",
" if i < 40: break\n",
" for name in d[i]:\n",
" k = dataset[name]['raw']\n",
" article_images = []\n",
" # find the first image in the article\n",
" files = k.split('File:') \n",
" if len(files) > 0:\n",
" for file in files[1:]:\n",
" filename = file.split('|')[0].split(']')[0]\n",
" urls = get_image_urls(filename)\n",
" article_images.append(urls)\n",
" image_urls[name] = article_images\n",
" with open(os.path.join('data', 'json', 'image_urls.json'), 'w') as out:\n",
" json.dump(image_urls, out)\n",
" \n",
"def download_image(url, filename):\n",
" return os.system('wget \"{}\" -O \"{}\" -q'.format(url, filename))\n",
" \n",
"def download_images():\n",
" image_urls = json.load(open(os.path.join('data', 'json', 'image_urls.json')))\n",
" for name_idx, name in enumerate(image_urls):\n",
" urls = image_urls[name]\n",
" if not urls: continue\n",
" for url_idx, url in enumerate(urls[:10]):\n",
" url = '_'.join(url['full'].split())\n",
" ext = url.split('.')[-1]\n",
" if url.lower().endswith('.jpg') or url.lower().endswith('.png'):\n",
" try:\n",
" filename = os.path.join('data', 'images', format_page_name(name) + '-{}.jpg'.format(url_idx))\n",
" if not os.path.exists(filename):\n",
" download_image(url, filename)\n",
" except:\n",
" pass\n",
" \n",
"#get_selected_image_urls()\n",
"#download_images()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment