Created
September 9, 2021 16:35
-
-
Save duhaime/db54a10161112db6eabdcc0121bd5938 to your computer and use it in GitHub Desktop.
get_wiki_people.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "8c055fde", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"%load_ext autoreload\n", | |
"%autoreload 2\n", | |
"\n", | |
"dataset = {}" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "34e10acb", | |
"metadata": {}, | |
"source": [ | |
"# Get Text Data" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "f4d07f06", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from multiprocessing import Pool\n", | |
"import json\n", | |
"\n", | |
"def process_batch(l, idx, out_dir=None):\n", | |
"    '''Write one batch of raw page strings to disk as JSON\n", | |
"\n", | |
"    l: list of raw page strings that make up the batch\n", | |
"    idx: batch number, used to name the file humans-<idx>.json\n", | |
"    out_dir: target directory; defaults to data/humans, the directory the\n", | |
"      dataset-building cell globs when it reads the batches back in\n", | |
"    '''\n", | |
"    # os is only imported by a later cell, so import it locally to keep this\n", | |
"    # function usable when the notebook is run from the top\n", | |
"    import os\n", | |
"    # resolve the directory explicitly instead of relying on a global\n", | |
"    if out_dir is None:\n", | |
"        out_dir = os.path.join('data', 'humans')\n", | |
"    os.makedirs(out_dir, exist_ok=True)\n", | |
"    with open(os.path.join(out_dir, 'humans-' + str(idx) + '.json'), 'w') as out:\n", | |
"        json.dump(l, out)\n", | |
"\n", | |
"def keep_page(page):\n", | |
"    '''Decide whether a raw page is retained for analysis; only human pages are'''\n", | |
"    retain = is_human_page(page)\n", | |
"    return retain\n", | |
" \n", | |
"def is_human_page(page):\n", | |
"    '''Return True when the page has a births marker and is not a bad page'''\n", | |
"    # presumably 'births]]' is the tail of a births category link -- verify\n", | |
"    has_birth_marker = 'births]]' in page\n", | |
"    return has_birth_marker and not is_bad_page(page)\n", | |
"\n", | |
"def is_bad_page(page):\n", | |
"    '''Return True when the page title starts with an excluded prefix'''\n", | |
"    excluded_prefixes = ('Wikipedia:', 'Template:', 'Draft:')\n", | |
"    return get_page_name(page).startswith(excluded_prefixes)\n", | |
"\n", | |
"def get_page_name(page):\n", | |
"    '''Return the text between the first <title> tag and its closing tag\n", | |
"\n", | |
"    Raises IndexError when the page holds no <title> tag.\n", | |
"    '''\n", | |
"    after_open_tag = page.split('<title>')[1]\n", | |
"    title, _, _ = after_open_tag.partition('</title>')\n", | |
"    return title\n", | |
"\n", | |
"def get_page_text(page):\n", | |
"    '''Given a wikipedia page in XML form return plaintext for text mining\n", | |
"\n", | |
"    Returns lowercased text with markup and punctuation removed and whitespace\n", | |
"    collapsed, or an empty string if anything goes wrong while parsing.\n", | |
"    '''\n", | |
"    try:\n", | |
"        # after the first `<text ` come: attributes, `>`, the body, `</text>`\n", | |
"        after_tag = page.split('<text ')[1]\n", | |
"        # cut at the first `>` only; splitting on every `>` truncated the body\n", | |
"        # at its next `>` and left a dangling `</text` on the end\n", | |
"        attrs, _, rest = after_tag.partition('>')\n", | |
"        # a self-closing <text ... /> element has no body at all\n", | |
"        c = '' if attrs.rstrip().endswith('/') else rest.split('</text>')[0]\n", | |
"        c = remove_markup(c)\n", | |
"        c = BeautifulSoup(c).get_text()\n", | |
"        c = c.replace('=', ' ')\n", | |
"        c = c.lower()\n", | |
"        # swap every punctuation character for a space in a single pass\n", | |
"        c = c.translate(str.maketrans(string.punctuation, ' '*len(string.punctuation)))\n", | |
"        c = ' '.join(c.split())\n", | |
"        return c\n", | |
"    except Exception as exc:\n", | |
"        # best effort: report the problem and treat the page as having no text\n", | |
"        print(exc)\n", | |
"        return ''\n", | |
" \n", | |
"def parse_pages(xml_path=None, batch_size=1000):\n", | |
"    '''Stream the Wikipedia XML dump and write the kept pages out in batches\n", | |
"\n", | |
"    xml_path: path to the decompressed dump; defaults to the 20210820 export\n", | |
"    batch_size: number of kept pages handed to process_batch per output file\n", | |
"    '''\n", | |
"    # downloaded from https://dumps.wikimedia.org/enwiki/20210820/enwiki-20210820-pages-articles-multistream.xml.bz2\n", | |
"    # if that's gone, look in https://dumps.wikimedia.org/enwiki/ for a more recent export with same pages-articles-multistream pattern\n", | |
"    import os  # only imported by a later cell otherwise\n", | |
"    if xml_path is None:\n", | |
"        xml_path = os.path.join('data', 'xml', 'enwiki-20210820-pages-articles-multistream.xml')\n", | |
"    batch = []\n", | |
"    page_lines = []\n", | |
"    headers_clipped = False\n", | |
"    batch_count = 0\n", | |
"    kept = 0\n", | |
"    # read as utf-8 explicitly rather than with the platform default encoding\n", | |
"    with open(xml_path, encoding='utf-8') as f:\n", | |
"        for line in f:\n", | |
"            if '<page>' in line:\n", | |
"                # a new page starts, so whatever was accumulated is a full page\n", | |
"                if page_lines:\n", | |
"                    page = ''.join(page_lines)\n", | |
"                    page_lines = []\n", | |
"                    if keep_page(page):\n", | |
"                        batch.append(page)\n", | |
"                if len(batch) >= batch_size:\n", | |
"                    process_batch(batch, batch_count)\n", | |
"                    kept += len(batch)\n", | |
"                    batch = []\n", | |
"                    batch_count += 1\n", | |
"            # only accumulate lines once the <siteinfo> header has been passed\n", | |
"            if headers_clipped:\n", | |
"                page_lines.append(line)\n", | |
"            if '</siteinfo>' in line:\n", | |
"                headers_clipped = True\n", | |
"    # the last page is never followed by another <page> line, so flush it here\n", | |
"    if page_lines:\n", | |
"        page = ''.join(page_lines)\n", | |
"        if keep_page(page):\n", | |
"            batch.append(page)\n", | |
"    # write the final partial batch, skipping an empty one\n", | |
"    if batch:\n", | |
"        process_batch(batch, batch_count)\n", | |
"        kept += len(batch)\n", | |
"    # print total processed, counting the final partial batch as well\n", | |
"    print(' * processed', kept)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "053d9a40", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from wikitextparser import remove_markup\n", | |
"from bs4 import BeautifulSoup\n", | |
"import hashlib\n", | |
"import string\n", | |
"import glob\n", | |
"import json\n", | |
"import html\n", | |
"import os\n", | |
"\n", | |
"# read back every batch file in data/humans and index the human pages by title\n", | |
"for batch_path in glob.glob(os.path.join('data', 'humans', '*')):\n", | |
"    # a context manager closes each batch file as soon as it has been parsed\n", | |
"    with open(batch_path) as f:\n", | |
"        pages = json.load(f)\n", | |
"    for page in pages:\n", | |
"        if is_human_page(page):\n", | |
"            # NOTE(review): later cells read a 'text' field that is not\n", | |
"            # populated here -- confirm where it is meant to be computed\n", | |
"            dataset[get_page_name(page)] = {\n", | |
"                'raw': page,\n", | |
"                'length': len(page),\n", | |
"            }\n", | |
"\n", | |
"# make sure the output directory exists before writing\n", | |
"os.makedirs(os.path.join('data', 'json'), exist_ok=True)\n", | |
"with open(os.path.join('data', 'json', 'dataset.json'), 'w') as out:\n", | |
"    json.dump(dataset, out)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "c4a0f773", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"%matplotlib inline\n", | |
"import matplotlib.pyplot as plt\n", | |
"\n", | |
"# histogram of bio lengths, leaving out bios of 20000 characters or more\n", | |
"raw_lengths = (len(record['raw']) for record in dataset.values())\n", | |
"l = [n for n in raw_lengths if n < 20000]\n", | |
"\n", | |
"plt.hist(l, bins=100)\n", | |
"plt.show()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "08e703be", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# find some example people at various thresholds of bio length\n", | |
"from collections import defaultdict\n", | |
"\n", | |
"# d maps a length bin (stored length // 5000) to the names that fall in it\n", | |
"d = defaultdict(list)\n", | |
"for person, record in dataset.items():\n", | |
"    d[record['length'] // 5000].append(person)\n", | |
"\n", | |
"# show the occupied bins, largest first\n", | |
"sorted(d, reverse=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "43c3a6a2", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# store the length bin of all people and delete those with short bios\n", | |
"keepers = set()\n", | |
"to_delete = []\n", | |
"for i, record in dataset.items():\n", | |
"    # derive the bin from the stored length, using the same 5000 width as d\n", | |
"    lenbin = record['length'] // 5000\n", | |
"    if lenbin >= 2:\n", | |
"        # actually store the bin: the texts export reads dataset[i]['lenbin']\n", | |
"        record['lenbin'] = lenbin\n", | |
"        keepers.add(i)\n", | |
"    else:\n", | |
"        to_delete.append(i)\n", | |
"\n", | |
"# delete in a second pass so the dict is not resized while it is iterated\n", | |
"for i in to_delete:\n", | |
"    del dataset[i]" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "77e8dcc9", | |
"metadata": {}, | |
"source": [ | |
"# Get Page Image and Bio" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "a756c930", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import json, os\n", | |
"\n", | |
"# load data/json/dataset.json, closing the file once it has been parsed\n", | |
"with open(os.path.join('data', 'json', 'dataset.json')) as f:\n", | |
"    dataset = json.load(f)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"id": "b7760210", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"type(dataset), len(dataset)\n", | |
"\n", | |
"keys = list(dataset.keys())" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"id": "16b52133", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# get a smaller group to focus on\n", | |
"l = [i for i in keys if len(dataset[i]['raw']) >= 50000]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "58050154", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"0 2021-09-09 09:21:50.938463\n", | |
"100 2021-09-09 09:22:49.699364\n", | |
"200 2021-09-09 09:23:56.098555\n", | |
"300 2021-09-09 09:24:56.744201\n", | |
"400 2021-09-09 09:25:57.821086\n", | |
"500 2021-09-09 09:26:57.883951\n", | |
"600 2021-09-09 09:27:59.442425\n", | |
"700 2021-09-09 09:29:01.890677\n", | |
"800 2021-09-09 09:30:01.504646\n", | |
"900 2021-09-09 09:31:00.793757\n", | |
"1000 2021-09-09 09:32:00.160513\n", | |
"1100 2021-09-09 09:33:00.319626\n", | |
"1200 2021-09-09 09:34:01.462002\n", | |
"1300 2021-09-09 09:35:00.213132\n", | |
"1400 2021-09-09 09:36:04.343532\n", | |
"1500 2021-09-09 09:37:03.728502\n", | |
"1600 2021-09-09 09:38:04.409510\n", | |
"1700 2021-09-09 09:39:05.645563\n", | |
"1800 2021-09-09 09:40:05.512104\n", | |
"1900 2021-09-09 09:41:04.983853\n", | |
"2000 2021-09-09 09:42:05.071490\n", | |
"2100 2021-09-09 09:43:04.250173\n", | |
"2200 2021-09-09 09:44:05.823320\n", | |
"2300 2021-09-09 09:45:05.642249\n", | |
"2400 2021-09-09 09:46:09.129941\n", | |
"2500 2021-09-09 09:47:10.542313\n", | |
"2600 2021-09-09 09:48:09.532581\n", | |
"2700 2021-09-09 09:49:16.028473\n", | |
"2800 2021-09-09 09:50:19.870628\n", | |
"2900 2021-09-09 09:51:24.437579\n", | |
"3000 2021-09-09 09:52:25.665282\n", | |
"3100 2021-09-09 09:53:30.297827\n", | |
"3200 2021-09-09 09:54:44.955433\n", | |
"3300 2021-09-09 09:55:52.429292\n", | |
"3400 2021-09-09 09:56:56.851773\n", | |
"3500 2021-09-09 09:58:04.255281\n", | |
"3600 2021-09-09 09:59:08.066332\n", | |
"3700 2021-09-09 10:00:13.600882\n", | |
"3800 2021-09-09 10:01:20.624927\n", | |
"3900 2021-09-09 10:02:25.553990\n", | |
"4000 2021-09-09 10:03:34.968171\n", | |
"4100 2021-09-09 10:04:37.125105\n", | |
"4200 2021-09-09 10:05:40.407464\n", | |
"4300 2021-09-09 10:06:45.158944\n", | |
"4400 2021-09-09 10:07:45.045416\n", | |
"4500 2021-09-09 10:09:04.535555\n", | |
"4600 2021-09-09 10:10:08.300479\n", | |
"4700 2021-09-09 10:11:09.304536\n", | |
"4800 2021-09-09 10:12:19.841213\n", | |
"4900 2021-09-09 10:13:18.449895\n", | |
"5000 2021-09-09 10:14:16.242837\n", | |
"5100 2021-09-09 10:15:13.686099\n", | |
"5200 2021-09-09 10:16:11.130482\n", | |
"5300 2021-09-09 10:17:12.401129\n", | |
"5400 2021-09-09 10:18:09.957899\n", | |
"5500 2021-09-09 10:19:07.357226\n", | |
"5600 2021-09-09 10:20:04.952373\n", | |
"5700 2021-09-09 10:21:04.335310\n", | |
"5800 2021-09-09 10:22:05.479684\n", | |
"5900 2021-09-09 10:23:02.874859\n", | |
"6000 2021-09-09 10:24:00.527678\n", | |
"6100 2021-09-09 10:24:58.551270\n", | |
"6200 2021-09-09 10:25:56.270927\n", | |
"6300 2021-09-09 10:26:55.179370\n", | |
"6400 2021-09-09 10:27:55.873549\n", | |
"6500 2021-09-09 10:28:52.673341\n", | |
"6600 2021-09-09 10:29:49.936690\n", | |
"6700 2021-09-09 10:30:47.451767\n", | |
"6800 2021-09-09 10:31:44.667065\n", | |
"6900 2021-09-09 10:33:19.921487\n", | |
"7000 2021-09-09 10:34:17.552654\n", | |
"7100 2021-09-09 10:36:21.059461\n", | |
"7200 2021-09-09 10:37:21.757798\n", | |
"7300 2021-09-09 10:38:18.303951\n", | |
"7400 2021-09-09 10:39:15.664794\n", | |
"7500 2021-09-09 10:40:12.685598\n", | |
"7600 2021-09-09 10:42:25.327668\n", | |
"7700 2021-09-09 10:43:22.774925\n", | |
"7800 2021-09-09 10:44:21.692950\n", | |
"7900 2021-09-09 10:45:18.834321\n", | |
"8000 2021-09-09 10:46:19.470151\n", | |
"8100 2021-09-09 10:47:17.386969\n", | |
"8200 2021-09-09 10:48:14.879007\n", | |
"8300 2021-09-09 10:49:15.259692\n", | |
"8400 2021-09-09 10:50:13.423401\n", | |
"8500 2021-09-09 10:51:13.040644\n", | |
"8600 2021-09-09 10:52:10.386851\n", | |
"8700 2021-09-09 10:53:08.048590\n", | |
"8800 2021-09-09 10:54:05.640330\n", | |
"8900 2021-09-09 10:55:06.592415\n" | |
] | |
} | |
], | |
"source": [ | |
"from SPARQLWrapper import SPARQLWrapper, JSON\n", | |
"import html, datetime\n", | |
"\n", | |
"def format_page_name(s):\n", | |
"    '''Join the whitespace-separated words of a page name with underscores'''\n", | |
"    words = s.split()\n", | |
"    return '_'.join(words)\n", | |
"\n", | |
"def get_dbpedia_metadata(name):\n", | |
"    '''Given a pagename get the dbpedia image and metadata\n", | |
"\n", | |
"    Returns a dict with the keys name, abstract and thumb, or None when the\n", | |
"    response holds no binding with both an English abstract and a thumbnail.\n", | |
"    Errors raised while running the query propagate to the caller.\n", | |
"    '''\n", | |
"    name = format_page_name(name)\n", | |
"\n", | |
"    # percent-encode the characters that may not appear inside a SPARQL <...>\n", | |
"    # IRI so an unusual page title cannot break or rewrite the query;\n", | |
"    # chr(34) is a double quote and chr(92) a backslash\n", | |
"    illegal = set('<>{}|^`') | {chr(34), chr(92)}\n", | |
"    iri_name = ''.join(\n", | |
"        '%{:02X}'.format(ord(ch)) if ch in illegal or ord(ch) <= 0x20 else ch\n", | |
"        for ch in name\n", | |
"    )\n", | |
"\n", | |
"    # nb: the following query can be tested on https://dbpedia.org/sparql\n", | |
"    query = '''\n", | |
"    prefix dbpedia: <http://dbpedia.org/resource/>\n", | |
"    prefix dbpedia-owl: <http://dbpedia.org/ontology/>\n", | |
"\n", | |
"    select ?abstract ?thumbnail where { \n", | |
"      <http://dbpedia.org/resource/''' + iri_name + '''> dbpedia-owl:abstract ?abstract ;\n", | |
"      dbpedia-owl:thumbnail ?thumbnail .\n", | |
"      filter(langMatches(lang(?abstract),\"en\"))\n", | |
"    }\n", | |
"    '''\n", | |
"\n", | |
"    sparql = SPARQLWrapper(\"http://dbpedia.org/sparql\")\n", | |
"    sparql.setReturnFormat(JSON)\n", | |
"    sparql.setQuery(query)\n", | |
"    j = sparql.query().convert()\n", | |
"    try:\n", | |
"        # only the first binding is used\n", | |
"        result = j['results']['bindings'][0]\n", | |
"        return {\n", | |
"            'name': name,\n", | |
"            'abstract': result['abstract']['value'],\n", | |
"            'thumb': result['thumbnail']['value'],\n", | |
"        }\n", | |
"    except (KeyError, IndexError, TypeError):\n", | |
"        # no binding, or a response without the expected fields\n", | |
"        return None\n", | |
"\n", | |
"results = []\n", | |
"for idx, i in enumerate(l):\n", | |
"    # log progress every 100 pages\n", | |
"    if idx%100 == 0: print(idx, datetime.datetime.now())\n", | |
"    try:\n", | |
"        result = get_dbpedia_metadata(i)\n", | |
"    except Exception as exc:\n", | |
"        # best effort: report the failed lookup and move on; unlike a bare\n", | |
"        # except this still lets KeyboardInterrupt stop the long-running loop\n", | |
"        print(' ! lookup failed for', i, exc)\n", | |
"        continue\n", | |
"    if not result: continue\n", | |
"    # merge the dbpedia fields into the existing record for this person\n", | |
"    record = dataset[i]\n", | |
"    record.update(result)\n", | |
"    results.append(record)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "74048443", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"%load_ext autoreload\n", | |
"%autoreload 2" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "af5b8371", | |
"metadata": {}, | |
"source": [ | |
"# Extract Faces" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "9affe20c", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import matplotlib.pyplot as plt\n", | |
"from matplotlib import pyplot\n", | |
"from mtcnn.mtcnn import MTCNN\n", | |
"import os, json, glob, imageio\n", | |
"import numpy as np\n", | |
"\n", | |
"# load the image urls, closing the file once it has been parsed\n", | |
"with open(os.path.join('data', 'json', 'image_urls.json')) as f:\n", | |
"    image_urls = json.load(f)\n", | |
"detector = MTCNN()\n", | |
"# make sure the output directory for the cropped faces exists\n", | |
"os.makedirs(os.path.join('data', 'faces'), exist_ok=True)\n", | |
"\n", | |
"for name in image_urls:\n", | |
"    name = format_page_name(name)\n", | |
"    images = glob.glob(os.path.join('data', 'images', name + '-*'))\n", | |
"    print(name, 'has', len(images), 'images')\n", | |
"    for image_idx, image in enumerate(images):\n", | |
"        try:\n", | |
"            # keep the first three channels only\n", | |
"            im = pyplot.imread(image)[:,:,:3]\n", | |
"        except Exception as exc:\n", | |
"            # unreadable images are removed; catching Exception instead of\n", | |
"            # everything means an interrupt can no longer delete a good file\n", | |
"            print(' ! removing unreadable image', image, exc)\n", | |
"            os.remove(image)\n", | |
"            continue\n", | |
"        try:\n", | |
"            detections = detector.detect_faces(im)\n", | |
"            for result_idx, result in enumerate(detections):\n", | |
"                confidence = result['confidence']\n", | |
"                points = result['keypoints'] # each is x, y\n", | |
"                x1, y1, width, height = result['box']\n", | |
"                # skip uncertain detections and faces under 80 pixels\n", | |
"                if confidence < 0.95: continue\n", | |
"                if width < 80 or height < 80: continue\n", | |
"                # identify keypoints\n", | |
"                le_x, le_y = points['left_eye']\n", | |
"                re_x, re_y = points['right_eye']\n", | |
"                lm_x, lm_y = points['mouth_left']\n", | |
"                rm_x, rm_y = points['mouth_right']\n", | |
"                no_x, no_y = points['nose']\n", | |
"                # get roll and yaw as offsets relative to the face box size\n", | |
"                roll = np.mean([le_y - re_y, lm_y - rm_y]) / height\n", | |
"                yaw = (((le_x + re_x) / 2) - no_x) / width\n", | |
"                # crop face with 20% padding, clamping the top-left corner at 0:\n", | |
"                # a negative start index would wrap around to the far edge of\n", | |
"                # the image and give an empty or wrong crop\n", | |
"                pad_top = int(height * 0.2)\n", | |
"                pad_side = int(width * 0.2)\n", | |
"                x2, y2 = x1 + width, y1 + height\n", | |
"                top, left = max(0, y1 - pad_top), max(0, x1 - pad_side)\n", | |
"                face = im[top:y2+pad_top, left:x2+pad_side]\n", | |
"                filename = '{}_{}_{}_{}_{}_{}.jpg'.format(\n", | |
"                    name, \n", | |
"                    image_idx, \n", | |
"                    result_idx, \n", | |
"                    round(confidence, 2),\n", | |
"                    round(roll, 2),\n", | |
"                    round(yaw, 2),\n", | |
"                )\n", | |
"                path = os.path.join('data', 'faces', filename)\n", | |
"                imageio.imwrite(path, face)\n", | |
"        except Exception as exc:\n", | |
"            print(exc)\n", | |
"            continue" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "f4575c07", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from keras_vggface.vggface import VGGFace\n", | |
"import numpy as np\n", | |
"\n", | |
"# get face vector for comparison and identification of same person across multiple images\n", | |
"model = VGGFace(model='resnet50')\n", | |
"a = np.random.rand(1,224,224,3)\n", | |
"z = model.predict(a)\n", | |
"z.shape" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "487e824e", | |
"metadata": {}, | |
"source": [ | |
"# Vectorize text" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "d7dda2fa", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from sklearn.feature_extraction.text import TfidfVectorizer\n", | |
"from sklearn.decomposition import NMF\n", | |
"\n", | |
"n_topics = 100 \n", | |
"word_scalar = 1000\n", | |
"\n", | |
"vectorizer = TfidfVectorizer(\n", | |
" input = 'content', \n", | |
" stop_words = 'english', \n", | |
" min_df = 5, \n", | |
" max_df = 0.8, \n", | |
" max_features = word_scalar * n_topics, \n", | |
")\n", | |
"\n", | |
"model = NMF(\n", | |
" n_components = n_topics,\n", | |
" random_state = 1,\n", | |
" verbose = 1,\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "dd1e7f9d", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"texts = []\n", | |
"for i, record in dataset.items():\n", | |
"    # the dataset-building cell stores only 'raw' and 'length', so derive the\n", | |
"    # plaintext and the length bin on demand instead of raising a KeyError\n", | |
"    if 'text' not in record:\n", | |
"        record['text'] = get_page_text(record['raw'])\n", | |
"    if 'lenbin' not in record:\n", | |
"        # same 5000-character bin width used when the bios were binned\n", | |
"        record['lenbin'] = record['length'] // 5000\n", | |
"    texts.append({\n", | |
"        'name': i,\n", | |
"        # a 200-character preview, always followed by an ellipsis\n", | |
"        'text': record['text'][:200] + '...',\n", | |
"        'lenbin': record['lenbin'],\n", | |
"    })\n", | |
"\n", | |
"with open(os.path.join('data', 'json', 'texts.json'), 'w') as out:\n", | |
"    json.dump(texts, out)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "380017f3", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# plaintext per document; computed from the raw page when no 'text' is stored\n", | |
"l = [\n", | |
"    record['text'] if 'text' in record else get_page_text(record['raw'])\n", | |
"    for record in dataset.values()\n", | |
"]\n", | |
"\n", | |
"# create the term document matrix\n", | |
"D = vectorizer.fit_transform(l)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "51e9ea02", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# optionally store the column labels (distinct terms in vectorizer model)\n", | |
"# get_feature_names was removed in scikit-learn 1.2 in favour of\n", | |
"# get_feature_names_out, so prefer the new method and fall back on old versions\n", | |
"if hasattr(vectorizer, 'get_feature_names_out'):\n", | |
"    words = list(vectorizer.get_feature_names_out())\n", | |
"else:\n", | |
"    words = vectorizer.get_feature_names()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "fda6f2f9", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import numpy as np\n", | |
"\n", | |
"# Get matrix W above -- one row per document, one column per topic\n", | |
"W = model.fit_transform(D)\n", | |
"\n", | |
"# Get matrix T above -- one row per topic, one column per unique word\n", | |
"T = model.components_" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "aaca2b23", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# rename D to `documents_by_terms`\n", | |
"documents_by_terms = D\n", | |
"\n", | |
"# rename W to `documents_by_topics`\n", | |
"documents_by_topics = W\n", | |
"\n", | |
"# rename T to `topics_by_terms`\n", | |
"topics_by_terms = T" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "64387ee3", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"np.save(os.path.join('data', 'npy', 'documents-by-topics'), documents_by_topics)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "821b3499", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from umap import UMAP\n", | |
"\n", | |
"z = UMAP(n_neighbors=8).fit_transform(documents_by_topics)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "0e50c968", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"colors = UMAP(n_neighbors=8, n_components=1).fit_transform(documents_by_topics)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "ff1f3c8d", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import json\n", | |
"import numpy as np\n", | |
"\n", | |
"def scale(a):\n", | |
"    '''Min-max scale the values of array a into the 0-1 range\n", | |
"\n", | |
"    The minimum and maximum are taken over the whole array, not per column.\n", | |
"    A constant array has no range to stretch, so it maps to all zeros instead\n", | |
"    of dividing by zero and filling the result with NaN.\n", | |
"    '''\n", | |
"    a = np.asarray(a)\n", | |
"    low = np.min(a)\n", | |
"    span = np.max(a) - low\n", | |
"    if span == 0:\n", | |
"        return np.zeros_like(a, dtype=float)\n", | |
"    return (a - low) / span\n", | |
"\n", | |
"# positions are stretched from the 0-1 range to -1..1; colors stay in 0-1\n", | |
"scaled_positions = 2.0 * (scale(z) - 0.5)\n", | |
"scaled_colors = scale(colors)\n", | |
"\n", | |
"with open(os.path.join('data', 'json', 'positions.json'), 'w') as out:\n", | |
"    payload = {\n", | |
"        'positions': scaled_positions.tolist(),\n", | |
"        'colors': scaled_colors.squeeze().tolist(),\n", | |
"    }\n", | |
"    json.dump(payload, out)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "aa809b93", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"np.max(scaled_positions)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "2b7563b1", | |
"metadata": {}, | |
"source": [ | |
"# Unused: Page View Counts" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "8fe12f3d", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# from https://dumps.wikimedia.org/other/pagecounts-raw/2016/2016-01/\n", | |
"links = '''\n", | |
"pagecounts-20160101-000000.gz, size 66M\n", | |
"pagecounts-20160101-010000.gz, size 77M\n", | |
"pagecounts-20160101-020000.gz, size 72M\n", | |
"pagecounts-20160101-030000.gz, size 71M\n", | |
"pagecounts-20160101-040000.gz, size 69M\n", | |
"pagecounts-20160101-050000.gz, size 68M\n", | |
"pagecounts-20160101-060000.gz, size 69M\n", | |
"pagecounts-20160101-070000.gz, size 71M\n", | |
"pagecounts-20160101-080000.gz, size 73M\n", | |
"pagecounts-20160101-090000.gz, size 77M\n", | |
"pagecounts-20160101-100000.gz, size 83M\n", | |
"pagecounts-20160101-110000.gz, size 85M\n", | |
"pagecounts-20160101-120000.gz, size 85M\n", | |
"pagecounts-20160101-130000.gz, size 86M\n", | |
"pagecounts-20160101-140000.gz, size 84M\n", | |
"pagecounts-20160101-150000.gz, size 86M\n", | |
"pagecounts-20160101-160000.gz, size 89M\n", | |
"pagecounts-20160101-170000.gz, size 89M\n", | |
"pagecounts-20160101-180000.gz, size 93M\n", | |
"pagecounts-20160101-190000.gz, size 95M\n", | |
"pagecounts-20160101-200000.gz, size 92M\n", | |
"pagecounts-20160101-210000.gz, size 91M\n", | |
"pagecounts-20160101-220000.gz, size 92M\n", | |
"pagecounts-20160101-230000.gz, size 83M\n", | |
"pagecounts-20160102-000000.gz, size 81M\n", | |
"pagecounts-20160102-010000.gz, size 83M\n", | |
"pagecounts-20160102-020000.gz, size 76M\n", | |
"pagecounts-20160102-030000.gz, size 76M\n", | |
"pagecounts-20160102-040000.gz, size 75M\n", | |
"pagecounts-20160102-050000.gz, size 73M\n", | |
"pagecounts-20160102-060000.gz, size 72M\n", | |
"pagecounts-20160102-070000.gz, size 71M\n", | |
"pagecounts-20160102-080000.gz, size 75M\n", | |
"pagecounts-20160102-090000.gz, size 79M\n", | |
"pagecounts-20160102-100000.gz, size 84M\n", | |
"pagecounts-20160102-110000.gz, size 90M\n", | |
"pagecounts-20160102-120000.gz, size 90M\n", | |
"pagecounts-20160102-130000.gz, size 92M\n", | |
"pagecounts-20160102-140000.gz, size 95M\n", | |
"pagecounts-20160102-150000.gz, size 93M\n", | |
"pagecounts-20160102-160000.gz, size 94M\n", | |
"pagecounts-20160102-170000.gz, size 96M\n", | |
"pagecounts-20160102-180000.gz, size 95M\n", | |
"pagecounts-20160102-190000.gz, size 92M\n", | |
"pagecounts-20160102-200000.gz, size 92M\n", | |
"pagecounts-20160102-210000.gz, size 89M\n", | |
"pagecounts-20160102-220000.gz, size 87M\n", | |
"pagecounts-20160102-230000.gz, size 86M\n", | |
"pagecounts-20160103-000000.gz, size 82M\n", | |
"pagecounts-20160103-010000.gz, size 85M\n", | |
"pagecounts-20160103-020000.gz, size 80M\n", | |
"pagecounts-20160103-030000.gz, size 76M\n", | |
"pagecounts-20160103-040000.gz, size 75M\n", | |
"pagecounts-20160103-050000.gz, size 75M\n", | |
"pagecounts-20160103-060000.gz, size 77M\n", | |
"pagecounts-20160103-070000.gz, size 76M\n", | |
"pagecounts-20160103-080000.gz, size 80M\n", | |
"pagecounts-20160103-090000.gz, size 82M\n", | |
"pagecounts-20160103-100000.gz, size 88M\n", | |
"pagecounts-20160103-110000.gz, size 90M\n", | |
"pagecounts-20160103-120000.gz, size 92M\n", | |
"pagecounts-20160103-130000.gz, size 94M\n", | |
"pagecounts-20160103-140000.gz, size 94M\n", | |
"pagecounts-20160103-150000.gz, size 95M\n", | |
"pagecounts-20160103-160000.gz, size 93M\n", | |
"pagecounts-20160103-170000.gz, size 93M\n", | |
"pagecounts-20160103-180000.gz, size 94M\n", | |
"pagecounts-20160103-190000.gz, size 92M\n", | |
"pagecounts-20160103-200000.gz, size 91M\n", | |
"pagecounts-20160103-210000.gz, size 90M\n", | |
"pagecounts-20160103-220000.gz, size 90M\n", | |
"pagecounts-20160103-230000.gz, size 86M\n", | |
"pagecounts-20160104-000000.gz, size 80M\n", | |
"pagecounts-20160104-010000.gz, size 83M\n", | |
"pagecounts-20160104-020000.gz, size 83M\n", | |
"pagecounts-20160104-030000.gz, size 81M\n", | |
"pagecounts-20160104-040000.gz, size 79M\n", | |
"pagecounts-20160104-050000.gz, size 79M\n", | |
"pagecounts-20160104-060000.gz, size 79M\n", | |
"pagecounts-20160104-070000.gz, size 81M\n", | |
"pagecounts-20160104-080000.gz, size 84M\n", | |
"pagecounts-20160104-090000.gz, size 89M\n", | |
"pagecounts-20160104-100000.gz, size 94M\n", | |
"pagecounts-20160104-110000.gz, size 96M\n", | |
"pagecounts-20160104-120000.gz, size 92M\n", | |
"pagecounts-20160104-130000.gz, size 93M\n", | |
"pagecounts-20160104-140000.gz, size 98M\n", | |
"pagecounts-20160104-150000.gz, size 101M\n", | |
"pagecounts-20160104-160000.gz, size 102M\n", | |
"pagecounts-20160104-170000.gz, size 98M\n", | |
"pagecounts-20160104-180000.gz, size 98M\n", | |
"pagecounts-20160104-190000.gz, size 98M\n", | |
"pagecounts-20160104-200000.gz, size 96M\n", | |
"pagecounts-20160104-210000.gz, size 96M\n", | |
"pagecounts-20160104-220000.gz, size 96M\n", | |
"pagecounts-20160104-230000.gz, size 93M\n", | |
"pagecounts-20160105-000000.gz, size 88M\n", | |
"pagecounts-20160105-010000.gz, size 90M\n", | |
"pagecounts-20160105-020000.gz, size 84M\n", | |
"pagecounts-20160105-030000.gz, size 81M\n", | |
"pagecounts-20160105-040000.gz, size 81M\n", | |
"pagecounts-20160105-050000.gz, size 82M\n", | |
"pagecounts-20160105-060000.gz, size 85M\n", | |
"pagecounts-20160105-070000.gz, size 85M\n", | |
"pagecounts-20160105-080000.gz, size 90M\n", | |
"pagecounts-20160105-090000.gz, size 100M\n", | |
"pagecounts-20160105-100000.gz, size 103M\n", | |
"pagecounts-20160105-110000.gz, size 105M\n", | |
"pagecounts-20160105-120000.gz, size 105M\n", | |
"pagecounts-20160105-130000.gz, size 104M\n", | |
"pagecounts-20160105-140000.gz, size 106M\n", | |
"pagecounts-20160105-150000.gz, size 107M\n", | |
"pagecounts-20160105-160000.gz, size 106M\n", | |
"pagecounts-20160105-170000.gz, size 106M\n", | |
"pagecounts-20160105-180000.gz, size 105M\n", | |
"pagecounts-20160105-190000.gz, size 110M\n", | |
"pagecounts-20160105-200000.gz, size 112M\n", | |
"pagecounts-20160105-210000.gz, size 107M\n", | |
"pagecounts-20160105-220000.gz, size 110M\n", | |
"pagecounts-20160105-230000.gz, size 112M\n", | |
"pagecounts-20160106-000000.gz, size 102M\n", | |
"pagecounts-20160106-010000.gz, size 102M\n", | |
"pagecounts-20160106-020000.gz, size 93M\n", | |
"pagecounts-20160106-030000.gz, size 88M\n", | |
"pagecounts-20160106-040000.gz, size 87M\n", | |
"pagecounts-20160106-050000.gz, size 84M\n", | |
"pagecounts-20160106-060000.gz, size 90M\n", | |
"pagecounts-20160106-070000.gz, size 94M\n", | |
"pagecounts-20160106-080000.gz, size 99M\n", | |
"pagecounts-20160106-090000.gz, size 102M\n", | |
"pagecounts-20160106-100000.gz, size 104M\n", | |
"pagecounts-20160106-110000.gz, size 105M\n", | |
"pagecounts-20160106-120000.gz, size 104M\n", | |
"pagecounts-20160106-130000.gz, size 101M\n", | |
"pagecounts-20160106-140000.gz, size 104M\n", | |
"pagecounts-20160106-150000.gz, size 104M\n", | |
"pagecounts-20160106-160000.gz, size 103M\n", | |
"pagecounts-20160106-170000.gz, size 101M\n", | |
"pagecounts-20160106-180000.gz, size 101M\n", | |
"pagecounts-20160106-190000.gz, size 107M\n", | |
"pagecounts-20160106-200000.gz, size 105M\n", | |
"pagecounts-20160106-210000.gz, size 101M\n", | |
"pagecounts-20160106-220000.gz, size 103M\n", | |
"pagecounts-20160106-230000.gz, size 101M\n", | |
"pagecounts-20160107-000000.gz, size 92M\n", | |
"pagecounts-20160107-010000.gz, size 92M\n", | |
"pagecounts-20160107-020000.gz, size 90M\n", | |
"pagecounts-20160107-030000.gz, size 86M\n", | |
"pagecounts-20160107-040000.gz, size 85M\n", | |
"pagecounts-20160107-050000.gz, size 83M\n", | |
"pagecounts-20160107-060000.gz, size 83M\n", | |
"pagecounts-20160107-070000.gz, size 83M\n", | |
"pagecounts-20160107-080000.gz, size 90M\n", | |
"pagecounts-20160107-090000.gz, size 96M\n", | |
"pagecounts-20160107-100000.gz, size 99M\n", | |
"pagecounts-20160107-110000.gz, size 104M\n", | |
"pagecounts-20160107-120000.gz, size 103M\n", | |
"pagecounts-20160107-130000.gz, size 103M\n", | |
"pagecounts-20160107-140000.gz, size 104M\n", | |
"pagecounts-20160107-150000.gz, size 105M\n", | |
"pagecounts-20160107-160000.gz, size 108M\n", | |
"pagecounts-20160107-170000.gz, size 107M\n", | |
"pagecounts-20160107-180000.gz, size 106M\n", | |
"pagecounts-20160107-190000.gz, size 108M\n", | |
"pagecounts-20160107-200000.gz, size 106M\n", | |
"pagecounts-20160107-210000.gz, size 101M\n", | |
"pagecounts-20160107-220000.gz, size 104M\n", | |
"pagecounts-20160107-230000.gz, size 97M\n", | |
"pagecounts-20160108-000000.gz, size 91M\n", | |
"pagecounts-20160108-010000.gz, size 96M\n", | |
"pagecounts-20160108-020000.gz, size 90M\n", | |
"pagecounts-20160108-030000.gz, size 86M\n", | |
"pagecounts-20160108-040000.gz, size 86M\n", | |
"pagecounts-20160108-050000.gz, size 85M\n", | |
"pagecounts-20160108-060000.gz, size 89M\n", | |
"pagecounts-20160108-070000.gz, size 89M\n", | |
"pagecounts-20160108-080000.gz, size 96M\n", | |
"pagecounts-20160108-090000.gz, size 99M\n", | |
"pagecounts-20160108-100000.gz, size 100M\n", | |
"pagecounts-20160108-110000.gz, size 98M\n", | |
"pagecounts-20160108-120000.gz, size 97M\n", | |
"pagecounts-20160108-130000.gz, size 96M\n", | |
"pagecounts-20160108-140000.gz, size 100M\n", | |
"pagecounts-20160108-150000.gz, size 104M\n", | |
"pagecounts-20160108-160000.gz, size 103M\n", | |
"pagecounts-20160108-170000.gz, size 106M\n", | |
"pagecounts-20160108-180000.gz, size 103M\n", | |
"pagecounts-20160108-190000.gz, size 103M\n", | |
"pagecounts-20160108-200000.gz, size 102M\n", | |
"pagecounts-20160108-210000.gz, size 97M\n", | |
"pagecounts-20160108-220000.gz, size 95M\n", | |
"pagecounts-20160108-230000.gz, size 96M\n", | |
"pagecounts-20160109-000000.gz, size 89M\n", | |
"pagecounts-20160109-010000.gz, size 92M\n", | |
"pagecounts-20160109-020000.gz, size 86M\n", | |
"pagecounts-20160109-030000.gz, size 79M\n", | |
"pagecounts-20160109-040000.gz, size 75M\n", | |
"pagecounts-20160109-050000.gz, size 77M\n", | |
"pagecounts-20160109-060000.gz, size 75M\n", | |
"pagecounts-20160109-070000.gz, size 77M\n", | |
"pagecounts-20160109-080000.gz, size 79M\n", | |
"pagecounts-20160109-090000.gz, size 86M\n", | |
"pagecounts-20160109-100000.gz, size 92M\n", | |
"pagecounts-20160109-110000.gz, size 96M\n", | |
"pagecounts-20160109-120000.gz, size 98M\n", | |
"pagecounts-20160109-130000.gz, size 94M\n", | |
"pagecounts-20160109-140000.gz, size 96M\n", | |
"pagecounts-20160109-150000.gz, size 96M\n", | |
"pagecounts-20160109-160000.gz, size 98M\n", | |
"pagecounts-20160109-170000.gz, size 98M\n", | |
"pagecounts-20160109-180000.gz, size 99M\n", | |
"pagecounts-20160109-190000.gz, size 98M\n", | |
"pagecounts-20160109-200000.gz, size 98M\n", | |
"pagecounts-20160109-210000.gz, size 95M\n", | |
"pagecounts-20160109-220000.gz, size 93M\n", | |
"pagecounts-20160109-230000.gz, size 91M\n", | |
"pagecounts-20160110-000000.gz, size 87M\n", | |
"pagecounts-20160110-010000.gz, size 86M\n", | |
"pagecounts-20160110-020000.gz, size 84M\n", | |
"pagecounts-20160110-030000.gz, size 79M\n", | |
"pagecounts-20160110-040000.gz, size 78M\n", | |
"pagecounts-20160110-050000.gz, size 78M\n", | |
"pagecounts-20160110-060000.gz, size 76M\n", | |
"pagecounts-20160110-070000.gz, size 81M\n", | |
"pagecounts-20160110-080000.gz, size 82M\n", | |
"pagecounts-20160110-090000.gz, size 85M\n", | |
"pagecounts-20160110-100000.gz, size 91M\n", | |
"pagecounts-20160110-110000.gz, size 93M\n", | |
"pagecounts-20160110-120000.gz, size 93M\n", | |
"pagecounts-20160110-130000.gz, size 92M\n", | |
"pagecounts-20160110-140000.gz, size 93M\n", | |
"pagecounts-20160110-150000.gz, size 96M\n", | |
"pagecounts-20160110-160000.gz, size 98M\n", | |
"pagecounts-20160110-170000.gz, size 97M\n", | |
"pagecounts-20160110-180000.gz, size 97M\n", | |
"pagecounts-20160110-190000.gz, size 99M\n", | |
"pagecounts-20160110-200000.gz, size 99M\n", | |
"pagecounts-20160110-210000.gz, size 93M\n", | |
"pagecounts-20160110-220000.gz, size 91M\n", | |
"pagecounts-20160110-230000.gz, size 90M\n", | |
"pagecounts-20160111-000000.gz, size 82M\n", | |
"pagecounts-20160111-010000.gz, size 83M\n", | |
"pagecounts-20160111-020000.gz, size 79M\n", | |
"pagecounts-20160111-030000.gz, size 80M\n", | |
"pagecounts-20160111-040000.gz, size 82M\n", | |
"pagecounts-20160111-050000.gz, size 83M\n", | |
"pagecounts-20160111-060000.gz, size 82M\n", | |
"pagecounts-20160111-070000.gz, size 86M\n", | |
"pagecounts-20160111-080000.gz, size 90M\n", | |
"pagecounts-20160111-090000.gz, size 95M\n", | |
"pagecounts-20160111-100000.gz, size 102M\n", | |
"pagecounts-20160111-110000.gz, size 96M\n", | |
"pagecounts-20160111-120000.gz, size 100M\n", | |
"pagecounts-20160111-130000.gz, size 98M\n", | |
"pagecounts-20160111-140000.gz, size 101M\n", | |
"pagecounts-20160111-150000.gz, size 102M\n", | |
"pagecounts-20160111-160000.gz, size 102M\n", | |
"pagecounts-20160111-170000.gz, size 101M\n", | |
"pagecounts-20160111-180000.gz, size 99M\n", | |
"pagecounts-20160111-190000.gz, size 99M\n", | |
"pagecounts-20160111-200000.gz, size 98M\n", | |
"pagecounts-20160111-210000.gz, size 99M\n", | |
"pagecounts-20160111-220000.gz, size 95M\n", | |
"pagecounts-20160111-230000.gz, size 91M\n", | |
"pagecounts-20160112-000000.gz, size 91M\n", | |
"pagecounts-20160112-010000.gz, size 91M\n", | |
"pagecounts-20160112-020000.gz, size 84M\n", | |
"pagecounts-20160112-030000.gz, size 80M\n", | |
"pagecounts-20160112-040000.gz, size 81M\n", | |
"pagecounts-20160112-050000.gz, size 82M\n", | |
"pagecounts-20160112-060000.gz, size 81M\n", | |
"pagecounts-20160112-070000.gz, size 83M\n", | |
"pagecounts-20160112-080000.gz, size 90M\n", | |
"pagecounts-20160112-090000.gz, size 93M\n", | |
"pagecounts-20160112-100000.gz, size 96M\n", | |
"pagecounts-20160112-110000.gz, size 99M\n", | |
"pagecounts-20160112-120000.gz, size 97M\n", | |
"pagecounts-20160112-130000.gz, size 95M\n", | |
"pagecounts-20160112-140000.gz, size 96M\n", | |
"pagecounts-20160112-150000.gz, size 101M\n", | |
"pagecounts-20160112-160000.gz, size 99M\n", | |
"pagecounts-20160112-170000.gz, size 97M\n", | |
"pagecounts-20160112-180000.gz, size 99M\n", | |
"pagecounts-20160112-190000.gz, size 97M\n", | |
"pagecounts-20160112-200000.gz, size 98M\n", | |
"pagecounts-20160112-210000.gz, size 95M\n", | |
"pagecounts-20160112-220000.gz, size 97M\n", | |
"pagecounts-20160112-230000.gz, size 90M\n", | |
"pagecounts-20160113-000000.gz, size 84M\n", | |
"pagecounts-20160113-010000.gz, size 86M\n", | |
"pagecounts-20160113-020000.gz, size 85M\n", | |
"pagecounts-20160113-030000.gz, size 84M\n", | |
"pagecounts-20160113-040000.gz, size 81M\n", | |
"pagecounts-20160113-050000.gz, size 80M\n", | |
"pagecounts-20160113-060000.gz, size 71M\n", | |
"pagecounts-20160113-070000.gz, size 77M\n", | |
"pagecounts-20160113-080000.gz, size 81M\n", | |
"pagecounts-20160113-090000.gz, size 85M\n", | |
"pagecounts-20160113-100000.gz, size 90M\n", | |
"pagecounts-20160113-110000.gz, size 92M\n", | |
"pagecounts-20160113-120000.gz, size 91M\n", | |
"pagecounts-20160113-130000.gz, size 92M\n", | |
"pagecounts-20160113-140000.gz, size 100M\n", | |
"pagecounts-20160113-150000.gz, size 103M\n", | |
"pagecounts-20160113-160000.gz, size 103M\n", | |
"pagecounts-20160113-170000.gz, size 101M\n", | |
"pagecounts-20160113-180000.gz, size 101M\n", | |
"pagecounts-20160113-190000.gz, size 101M\n", | |
"pagecounts-20160113-200000.gz, size 99M\n", | |
"pagecounts-20160113-210000.gz, size 98M\n", | |
"pagecounts-20160113-220000.gz, size 97M\n", | |
"pagecounts-20160113-230000.gz, size 93M\n", | |
"pagecounts-20160114-000000.gz, size 83M\n", | |
"pagecounts-20160114-010000.gz, size 87M\n", | |
"pagecounts-20160114-020000.gz, size 85M\n", | |
"pagecounts-20160114-030000.gz, size 79M\n", | |
"pagecounts-20160114-040000.gz, size 80M\n", | |
"pagecounts-20160114-050000.gz, size 82M\n", | |
"pagecounts-20160114-060000.gz, size 86M\n", | |
"pagecounts-20160114-070000.gz, size 85M\n", | |
"pagecounts-20160114-080000.gz, size 92M\n", | |
"pagecounts-20160114-090000.gz, size 96M\n", | |
"pagecounts-20160114-100000.gz, size 100M\n", | |
"pagecounts-20160114-110000.gz, size 101M\n", | |
"pagecounts-20160114-120000.gz, size 100M\n", | |
"pagecounts-20160114-130000.gz, size 97M\n", | |
"pagecounts-20160114-140000.gz, size 102M\n", | |
"pagecounts-20160114-150000.gz, size 103M\n", | |
"pagecounts-20160114-160000.gz, size 100M\n", | |
"pagecounts-20160114-170000.gz, size 100M\n", | |
"pagecounts-20160114-180000.gz, size 101M\n", | |
"pagecounts-20160114-190000.gz, size 102M\n", | |
"pagecounts-20160114-200000.gz, size 102M\n", | |
"pagecounts-20160114-210000.gz, size 102M\n", | |
"pagecounts-20160114-220000.gz, size 98M\n", | |
"pagecounts-20160114-230000.gz, size 92M\n", | |
"pagecounts-20160115-000000.gz, size 86M\n", | |
"pagecounts-20160115-010000.gz, size 88M\n", | |
"pagecounts-20160115-020000.gz, size 86M\n", | |
"pagecounts-20160115-030000.gz, size 82M\n", | |
"pagecounts-20160115-040000.gz, size 81M\n", | |
"pagecounts-20160115-050000.gz, size 80M\n", | |
"pagecounts-20160115-060000.gz, size 80M\n", | |
"pagecounts-20160115-070000.gz, size 80M\n", | |
"pagecounts-20160115-080000.gz, size 88M\n", | |
"pagecounts-20160115-090000.gz, size 93M\n", | |
"pagecounts-20160115-100000.gz, size 94M\n", | |
"pagecounts-20160115-110000.gz, size 92M\n", | |
"pagecounts-20160115-120000.gz, size 92M\n", | |
"pagecounts-20160115-130000.gz, size 90M\n", | |
"pagecounts-20160115-140000.gz, size 94M\n", | |
"pagecounts-20160115-150000.gz, size 99M\n", | |
"pagecounts-20160115-160000.gz, size 99M\n", | |
"pagecounts-20160115-170000.gz, size 98M\n", | |
"pagecounts-20160115-180000.gz, size 95M\n", | |
"pagecounts-20160115-190000.gz, size 95M\n", | |
"pagecounts-20160115-200000.gz, size 95M\n", | |
"pagecounts-20160115-210000.gz, size 92M\n", | |
"pagecounts-20160115-220000.gz, size 89M\n", | |
"pagecounts-20160115-230000.gz, size 88M\n", | |
"pagecounts-20160116-000000.gz, size 79M\n", | |
"pagecounts-20160116-010000.gz, size 79M\n", | |
"pagecounts-20160116-020000.gz, size 77M\n", | |
"pagecounts-20160116-030000.gz, size 75M\n", | |
"pagecounts-20160116-040000.gz, size 77M\n", | |
"pagecounts-20160116-050000.gz, size 79M\n", | |
"pagecounts-20160116-060000.gz, size 81M\n", | |
"pagecounts-20160116-070000.gz, size 82M\n", | |
"pagecounts-20160116-080000.gz, size 82M\n", | |
"pagecounts-20160116-090000.gz, size 84M\n", | |
"pagecounts-20160116-100000.gz, size 90M\n", | |
"pagecounts-20160116-110000.gz, size 91M\n", | |
"pagecounts-20160116-120000.gz, size 89M\n", | |
"pagecounts-20160116-130000.gz, size 89M\n", | |
"pagecounts-20160116-140000.gz, size 92M\n", | |
"pagecounts-20160116-150000.gz, size 93M\n", | |
"pagecounts-20160116-160000.gz, size 95M\n", | |
"pagecounts-20160116-170000.gz, size 92M\n", | |
"pagecounts-20160116-180000.gz, size 94M\n", | |
"pagecounts-20160116-190000.gz, size 92M\n", | |
"pagecounts-20160116-200000.gz, size 89M\n", | |
"pagecounts-20160116-210000.gz, size 90M\n", | |
"pagecounts-20160116-220000.gz, size 89M\n", | |
"pagecounts-20160116-230000.gz, size 89M\n", | |
"pagecounts-20160117-000000.gz, size 83M\n", | |
"pagecounts-20160117-010000.gz, size 83M\n", | |
"pagecounts-20160117-020000.gz, size 81M\n", | |
"pagecounts-20160117-030000.gz, size 77M\n", | |
"pagecounts-20160117-040000.gz, size 74M\n", | |
"pagecounts-20160117-050000.gz, size 77M\n", | |
"pagecounts-20160117-060000.gz, size 76M\n", | |
"pagecounts-20160117-070000.gz, size 78M\n", | |
"pagecounts-20160117-080000.gz, size 83M\n", | |
"pagecounts-20160117-090000.gz, size 88M\n", | |
"pagecounts-20160117-100000.gz, size 91M\n", | |
"pagecounts-20160117-110000.gz, size 94M\n", | |
"pagecounts-20160117-120000.gz, size 93M\n", | |
"pagecounts-20160117-130000.gz, size 89M\n", | |
"pagecounts-20160117-140000.gz, size 95M\n", | |
"pagecounts-20160117-150000.gz, size 100M\n", | |
"pagecounts-20160117-160000.gz, size 99M\n", | |
"pagecounts-20160117-170000.gz, size 94M\n", | |
"pagecounts-20160117-180000.gz, size 95M\n", | |
"pagecounts-20160117-190000.gz, size 91M\n", | |
"pagecounts-20160117-200000.gz, size 91M\n", | |
"pagecounts-20160117-210000.gz, size 88M\n", | |
"pagecounts-20160117-220000.gz, size 86M\n", | |
"pagecounts-20160117-230000.gz, size 86M\n", | |
"pagecounts-20160118-000000.gz, size 85M\n", | |
"pagecounts-20160118-010000.gz, size 88M\n", | |
"pagecounts-20160118-020000.gz, size 85M\n", | |
"pagecounts-20160118-030000.gz, size 81M\n", | |
"pagecounts-20160118-040000.gz, size 83M\n", | |
"pagecounts-20160118-050000.gz, size 83M\n", | |
"pagecounts-20160118-060000.gz, size 84M\n", | |
"pagecounts-20160118-070000.gz, size 83M\n", | |
"pagecounts-20160118-080000.gz, size 89M\n", | |
"pagecounts-20160118-090000.gz, size 96M\n", | |
"pagecounts-20160118-100000.gz, size 101M\n", | |
"pagecounts-20160118-110000.gz, size 100M\n", | |
"pagecounts-20160118-120000.gz, size 98M\n", | |
"pagecounts-20160118-130000.gz, size 95M\n", | |
"pagecounts-20160118-140000.gz, size 100M\n", | |
"pagecounts-20160118-150000.gz, size 104M\n", | |
"pagecounts-20160118-160000.gz, size 104M\n", | |
"pagecounts-20160118-170000.gz, size 103M\n", | |
"pagecounts-20160118-180000.gz, size 103M\n", | |
"pagecounts-20160118-190000.gz, size 102M\n", | |
"pagecounts-20160118-200000.gz, size 99M\n", | |
"pagecounts-20160118-210000.gz, size 98M\n", | |
"pagecounts-20160118-220000.gz, size 96M\n", | |
"pagecounts-20160118-230000.gz, size 90M\n", | |
"pagecounts-20160119-000000.gz, size 88M\n", | |
"pagecounts-20160119-010000.gz, size 89M\n", | |
"pagecounts-20160119-020000.gz, size 86M\n", | |
"pagecounts-20160119-030000.gz, size 81M\n", | |
"pagecounts-20160119-040000.gz, size 80M\n", | |
"pagecounts-20160119-050000.gz, size 81M\n", | |
"pagecounts-20160119-060000.gz, size 85M\n", | |
"pagecounts-20160119-070000.gz, size 87M\n", | |
"pagecounts-20160119-080000.gz, size 94M\n", | |
"pagecounts-20160119-090000.gz, size 97M\n", | |
"pagecounts-20160119-100000.gz, size 102M\n", | |
"pagecounts-20160119-110000.gz, size 100M\n", | |
"pagecounts-20160119-120000.gz, size 98M\n", | |
"pagecounts-20160119-130000.gz, size 97M\n", | |
"pagecounts-20160119-140000.gz, size 101M\n", | |
"pagecounts-20160119-150000.gz, size 102M\n", | |
"pagecounts-20160119-160000.gz, size 101M\n", | |
"pagecounts-20160119-170000.gz, size 97M\n", | |
"pagecounts-20160119-180000.gz, size 95M\n", | |
"pagecounts-20160119-190000.gz, size 95M\n", | |
"pagecounts-20160119-200000.gz, size 95M\n", | |
"pagecounts-20160119-210000.gz, size 96M\n", | |
"pagecounts-20160119-220000.gz, size 93M\n", | |
"pagecounts-20160119-230000.gz, size 89M\n", | |
"pagecounts-20160120-000000.gz, size 82M\n", | |
"pagecounts-20160120-010000.gz, size 84M\n", | |
"pagecounts-20160120-020000.gz, size 83M\n", | |
"pagecounts-20160120-030000.gz, size 79M\n", | |
"pagecounts-20160120-040000.gz, size 77M\n", | |
"pagecounts-20160120-050000.gz, size 76M\n", | |
"pagecounts-20160120-060000.gz, size 81M\n", | |
"pagecounts-20160120-070000.gz, size 83M\n", | |
"pagecounts-20160120-080000.gz, size 88M\n", | |
"pagecounts-20160120-090000.gz, size 97M\n", | |
"pagecounts-20160120-100000.gz, size 100M\n", | |
"pagecounts-20160120-110000.gz, size 103M\n", | |
"pagecounts-20160120-120000.gz, size 100M\n", | |
"pagecounts-20160120-130000.gz, size 96M\n", | |
"pagecounts-20160120-140000.gz, size 97M\n", | |
"pagecounts-20160120-150000.gz, size 100M\n", | |
"pagecounts-20160120-160000.gz, size 101M\n", | |
"pagecounts-20160120-170000.gz, size 102M\n", | |
"pagecounts-20160120-180000.gz, size 98M\n", | |
"pagecounts-20160120-190000.gz, size 97M\n", | |
"pagecounts-20160120-200000.gz, size 96M\n", | |
"pagecounts-20160120-210000.gz, size 94M\n", | |
"pagecounts-20160120-220000.gz, size 90M\n", | |
"pagecounts-20160120-230000.gz, size 84M\n", | |
"pagecounts-20160121-000000.gz, size 81M\n", | |
"pagecounts-20160121-010000.gz, size 88M\n", | |
"pagecounts-20160121-020000.gz, size 83M\n", | |
"pagecounts-20160121-030000.gz, size 82M\n", | |
"pagecounts-20160121-040000.gz, size 83M\n", | |
"pagecounts-20160121-050000.gz, size 81M\n", | |
"pagecounts-20160121-060000.gz, size 82M\n", | |
"pagecounts-20160121-070000.gz, size 86M\n", | |
"pagecounts-20160121-080000.gz, size 90M\n", | |
"pagecounts-20160121-090000.gz, size 95M\n", | |
"pagecounts-20160121-100000.gz, size 98M\n", | |
"pagecounts-20160121-110000.gz, size 101M\n", | |
"pagecounts-20160121-120000.gz, size 99M\n", | |
"pagecounts-20160121-130000.gz, size 95M\n", | |
"pagecounts-20160121-140000.gz, size 98M\n", | |
"pagecounts-20160121-150000.gz, size 99M\n", | |
"pagecounts-20160121-160000.gz, size 99M\n", | |
"pagecounts-20160121-170000.gz, size 98M\n", | |
"pagecounts-20160121-180000.gz, size 97M\n", | |
"pagecounts-20160121-190000.gz, size 95M\n", | |
"pagecounts-20160121-200000.gz, size 92M\n", | |
"pagecounts-20160121-210000.gz, size 92M\n", | |
"pagecounts-20160121-220000.gz, size 95M\n", | |
"pagecounts-20160121-230000.gz, size 91M\n", | |
"pagecounts-20160122-000000.gz, size 88M\n", | |
"pagecounts-20160122-010000.gz, size 95M\n", | |
"pagecounts-20160122-020000.gz, size 89M\n", | |
"pagecounts-20160122-030000.gz, size 87M\n", | |
"pagecounts-20160122-040000.gz, size 84M\n", | |
"pagecounts-20160122-050000.gz, size 82M\n", | |
"pagecounts-20160122-060000.gz, size 84M\n", | |
"pagecounts-20160122-070000.gz, size 85M\n", | |
"pagecounts-20160122-080000.gz, size 90M\n", | |
"pagecounts-20160122-090000.gz, size 92M\n", | |
"pagecounts-20160122-100000.gz, size 99M\n", | |
"pagecounts-20160122-110000.gz, size 99M\n", | |
"pagecounts-20160122-120000.gz, size 98M\n", | |
"pagecounts-20160122-130000.gz, size 92M\n", | |
"pagecounts-20160122-140000.gz, size 97M\n", | |
"pagecounts-20160122-150000.gz, size 100M\n", | |
"pagecounts-20160122-160000.gz, size 97M\n", | |
"pagecounts-20160122-170000.gz, size 95M\n", | |
"pagecounts-20160122-180000.gz, size 94M\n", | |
"pagecounts-20160122-190000.gz, size 93M\n", | |
"pagecounts-20160122-200000.gz, size 89M\n", | |
"pagecounts-20160122-210000.gz, size 86M\n", | |
"pagecounts-20160122-220000.gz, size 82M\n", | |
"pagecounts-20160122-230000.gz, size 82M\n", | |
"pagecounts-20160123-000000.gz, size 78M\n", | |
"pagecounts-20160123-010000.gz, size 83M\n", | |
"pagecounts-20160123-020000.gz, size 84M\n", | |
"pagecounts-20160123-030000.gz, size 79M\n", | |
"pagecounts-20160123-040000.gz, size 81M\n", | |
"pagecounts-20160123-050000.gz, size 83M\n", | |
"pagecounts-20160123-060000.gz, size 83M\n", | |
"pagecounts-20160123-070000.gz, size 82M\n", | |
"pagecounts-20160123-080000.gz, size 86M\n", | |
"pagecounts-20160123-090000.gz, size 89M\n", | |
"pagecounts-20160123-100000.gz, size 93M\n", | |
"pagecounts-20160123-110000.gz, size 93M\n", | |
"pagecounts-20160123-120000.gz, size 94M\n", | |
"pagecounts-20160123-130000.gz, size 93M\n", | |
"pagecounts-20160123-140000.gz, size 95M\n", | |
"pagecounts-20160123-150000.gz, size 95M\n", | |
"pagecounts-20160123-160000.gz, size 96M\n", | |
"pagecounts-20160123-170000.gz, size 99M\n", | |
"pagecounts-20160123-180000.gz, size 100M\n", | |
"pagecounts-20160123-190000.gz, size 97M\n", | |
"pagecounts-20160123-200000.gz, size 95M\n", | |
"pagecounts-20160123-210000.gz, size 92M\n", | |
"pagecounts-20160123-220000.gz, size 92M\n", | |
"pagecounts-20160123-230000.gz, size 91M\n", | |
"pagecounts-20160124-000000.gz, size 85M\n", | |
"pagecounts-20160124-010000.gz, size 91M\n", | |
"pagecounts-20160124-020000.gz, size 83M\n", | |
"pagecounts-20160124-030000.gz, size 82M\n", | |
"pagecounts-20160124-040000.gz, size 81M\n", | |
"pagecounts-20160124-050000.gz, size 86M\n", | |
"pagecounts-20160124-060000.gz, size 85M\n", | |
"pagecounts-20160124-070000.gz, size 85M\n", | |
"pagecounts-20160124-080000.gz, size 86M\n", | |
"pagecounts-20160124-090000.gz, size 89M\n", | |
"pagecounts-20160124-100000.gz, size 96M\n", | |
"pagecounts-20160124-110000.gz, size 96M\n", | |
"pagecounts-20160124-120000.gz, size 96M\n", | |
"pagecounts-20160124-130000.gz, size 93M\n", | |
"pagecounts-20160124-140000.gz, size 98M\n", | |
"pagecounts-20160124-150000.gz, size 99M\n", | |
"pagecounts-20160124-160000.gz, size 98M\n", | |
"pagecounts-20160124-170000.gz, size 96M\n", | |
"pagecounts-20160124-180000.gz, size 94M\n", | |
"pagecounts-20160124-190000.gz, size 96M\n", | |
"pagecounts-20160124-200000.gz, size 96M\n", | |
"pagecounts-20160124-210000.gz, size 93M\n", | |
"pagecounts-20160124-220000.gz, size 90M\n", | |
"pagecounts-20160124-230000.gz, size 87M\n", | |
"pagecounts-20160125-000000.gz, size 82M\n", | |
"pagecounts-20160125-010000.gz, size 85M\n", | |
"pagecounts-20160125-020000.gz, size 83M\n", | |
"pagecounts-20160125-030000.gz, size 79M\n", | |
"pagecounts-20160125-040000.gz, size 74M\n", | |
"pagecounts-20160125-050000.gz, size 73M\n", | |
"pagecounts-20160125-060000.gz, size 75M\n", | |
"pagecounts-20160125-070000.gz, size 82M\n", | |
"pagecounts-20160125-080000.gz, size 89M\n", | |
"pagecounts-20160125-090000.gz, size 94M\n", | |
"pagecounts-20160125-100000.gz, size 99M\n", | |
"pagecounts-20160125-110000.gz, size 97M\n", | |
"pagecounts-20160125-120000.gz, size 99M\n", | |
"pagecounts-20160125-130000.gz, size 96M\n", | |
"pagecounts-20160125-140000.gz, size 101M\n", | |
"pagecounts-20160125-150000.gz, size 103M\n", | |
"pagecounts-20160125-160000.gz, size 104M\n", | |
"pagecounts-20160125-170000.gz, size 102M\n", | |
"pagecounts-20160125-180000.gz, size 100M\n", | |
"pagecounts-20160125-190000.gz, size 97M\n", | |
"pagecounts-20160125-200000.gz, size 97M\n", | |
"pagecounts-20160125-210000.gz, size 97M\n", | |
"pagecounts-20160125-220000.gz, size 93M\n", | |
"pagecounts-20160125-230000.gz, size 89M\n", | |
"pagecounts-20160126-000000.gz, size 85M\n", | |
"pagecounts-20160126-010000.gz, size 88M\n", | |
"pagecounts-20160126-020000.gz, size 86M\n", | |
"pagecounts-20160126-030000.gz, size 83M\n", | |
"pagecounts-20160126-040000.gz, size 83M\n", | |
"pagecounts-20160126-050000.gz, size 82M\n", | |
"pagecounts-20160126-060000.gz, size 84M\n", | |
"pagecounts-20160126-070000.gz, size 85M\n", | |
"pagecounts-20160126-080000.gz, size 90M\n", | |
"pagecounts-20160126-090000.gz, size 94M\n", | |
"pagecounts-20160126-100000.gz, size 95M\n", | |
"pagecounts-20160126-110000.gz, size 95M\n", | |
"pagecounts-20160126-120000.gz, size 96M\n", | |
"pagecounts-20160126-130000.gz, size 93M\n", | |
"pagecounts-20160126-140000.gz, size 98M\n", | |
"pagecounts-20160126-150000.gz, size 100M\n", | |
"pagecounts-20160126-160000.gz, size 100M\n", | |
"pagecounts-20160126-170000.gz, size 99M\n", | |
"pagecounts-20160126-180000.gz, size 98M\n", | |
"pagecounts-20160126-190000.gz, size 94M\n", | |
"pagecounts-20160126-200000.gz, size 96M\n", | |
"pagecounts-20160126-210000.gz, size 94M\n", | |
"pagecounts-20160126-220000.gz, size 90M\n", | |
"pagecounts-20160126-230000.gz, size 88M\n", | |
"pagecounts-20160127-000000.gz, size 84M\n", | |
"pagecounts-20160127-010000.gz, size 87M\n", | |
"pagecounts-20160127-020000.gz, size 81M\n", | |
"pagecounts-20160127-030000.gz, size 81M\n", | |
"pagecounts-20160127-040000.gz, size 78M\n", | |
"pagecounts-20160127-050000.gz, size 79M\n", | |
"pagecounts-20160127-060000.gz, size 81M\n", | |
"pagecounts-20160127-070000.gz, size 82M\n", | |
"pagecounts-20160127-080000.gz, size 86M\n", | |
"pagecounts-20160127-090000.gz, size 91M\n", | |
"pagecounts-20160127-100000.gz, size 96M\n", | |
"pagecounts-20160127-110000.gz, size 96M\n", | |
"pagecounts-20160127-120000.gz, size 95M\n", | |
"pagecounts-20160127-130000.gz, size 94M\n", | |
"pagecounts-20160127-140000.gz, size 95M\n", | |
"pagecounts-20160127-150000.gz, size 97M\n", | |
"pagecounts-20160127-160000.gz, size 96M\n", | |
"pagecounts-20160127-170000.gz, size 96M\n", | |
"pagecounts-20160127-180000.gz, size 92M\n", | |
"pagecounts-20160127-190000.gz, size 92M\n", | |
"pagecounts-20160127-200000.gz, size 90M\n", | |
"pagecounts-20160127-210000.gz, size 90M\n", | |
"pagecounts-20160127-220000.gz, size 89M\n", | |
"pagecounts-20160127-230000.gz, size 86M\n", | |
"pagecounts-20160128-000000.gz, size 80M\n", | |
"pagecounts-20160128-010000.gz, size 82M\n", | |
"pagecounts-20160128-020000.gz, size 82M\n", | |
"pagecounts-20160128-030000.gz, size 81M\n", | |
"pagecounts-20160128-040000.gz, size 80M\n", | |
"pagecounts-20160128-050000.gz, size 78M\n", | |
"pagecounts-20160128-060000.gz, size 80M\n", | |
"pagecounts-20160128-070000.gz, size 83M\n", | |
"pagecounts-20160128-080000.gz, size 88M\n", | |
"pagecounts-20160128-090000.gz, size 94M\n", | |
"pagecounts-20160128-100000.gz, size 100M\n", | |
"pagecounts-20160128-110000.gz, size 100M\n", | |
"pagecounts-20160128-120000.gz, size 100M\n", | |
"pagecounts-20160128-130000.gz, size 97M\n", | |
"pagecounts-20160128-140000.gz, size 96M\n", | |
"pagecounts-20160128-150000.gz, size 99M\n", | |
"pagecounts-20160128-160000.gz, size 99M\n", | |
"pagecounts-20160128-170000.gz, size 97M\n", | |
"pagecounts-20160128-180000.gz, size 95M\n", | |
"pagecounts-20160128-190000.gz, size 96M\n", | |
"pagecounts-20160128-200000.gz, size 96M\n", | |
"pagecounts-20160128-210000.gz, size 96M\n", | |
"pagecounts-20160128-220000.gz, size 94M\n", | |
"pagecounts-20160128-230000.gz, size 93M\n", | |
"pagecounts-20160129-000000.gz, size 87M\n", | |
"pagecounts-20160129-010000.gz, size 90M\n", | |
"pagecounts-20160129-020000.gz, size 87M\n", | |
"pagecounts-20160129-030000.gz, size 86M\n", | |
"pagecounts-20160129-040000.gz, size 84M\n", | |
"pagecounts-20160129-050000.gz, size 85M\n", | |
"pagecounts-20160129-060000.gz, size 83M\n", | |
"pagecounts-20160129-070000.gz, size 87M\n", | |
"pagecounts-20160129-080000.gz, size 91M\n", | |
"pagecounts-20160129-090000.gz, size 94M\n", | |
"pagecounts-20160129-100000.gz, size 99M\n", | |
"pagecounts-20160129-110000.gz, size 96M\n", | |
"pagecounts-20160129-120000.gz, size 97M\n", | |
"pagecounts-20160129-130000.gz, size 96M\n", | |
"pagecounts-20160129-140000.gz, size 100M\n", | |
"pagecounts-20160129-150000.gz, size 98M\n", | |
"pagecounts-20160129-160000.gz, size 97M\n", | |
"pagecounts-20160129-170000.gz, size 97M\n", | |
"pagecounts-20160129-180000.gz, size 94M\n", | |
"pagecounts-20160129-190000.gz, size 97M\n", | |
"pagecounts-20160129-200000.gz, size 97M\n", | |
"pagecounts-20160129-210000.gz, size 95M\n", | |
"pagecounts-20160129-220000.gz, size 89M\n", | |
"pagecounts-20160129-230000.gz, size 83M\n", | |
"pagecounts-20160130-000000.gz, size 79M\n", | |
"pagecounts-20160130-010000.gz, size 81M\n", | |
"pagecounts-20160130-020000.gz, size 79M\n", | |
"pagecounts-20160130-030000.gz, size 72M\n", | |
"pagecounts-20160130-040000.gz, size 70M\n", | |
"pagecounts-20160130-050000.gz, size 71M\n", | |
"pagecounts-20160130-060000.gz, size 72M\n", | |
"pagecounts-20160130-070000.gz, size 73M\n", | |
"pagecounts-20160130-080000.gz, size 77M\n", | |
"pagecounts-20160130-090000.gz, size 81M\n", | |
"pagecounts-20160130-100000.gz, size 87M\n", | |
"pagecounts-20160130-110000.gz, size 90M\n", | |
"pagecounts-20160130-120000.gz, size 95M\n", | |
"pagecounts-20160130-130000.gz, size 94M\n", | |
"pagecounts-20160130-140000.gz, size 94M\n", | |
"pagecounts-20160130-150000.gz, size 96M\n", | |
"pagecounts-20160130-160000.gz, size 99M\n", | |
"pagecounts-20160130-170000.gz, size 96M\n", | |
"pagecounts-20160130-180000.gz, size 92M\n", | |
"pagecounts-20160130-190000.gz, size 89M\n", | |
"pagecounts-20160130-200000.gz, size 89M\n", | |
"pagecounts-20160130-210000.gz, size 91M\n", | |
"pagecounts-20160130-220000.gz, size 88M\n", | |
"pagecounts-20160130-230000.gz, size 85M\n", | |
"pagecounts-20160131-000000.gz, size 83M\n", | |
"pagecounts-20160131-010000.gz, size 87M\n", | |
"pagecounts-20160131-020000.gz, size 86M\n", | |
"pagecounts-20160131-030000.gz, size 84M\n", | |
"pagecounts-20160131-040000.gz, size 81M\n", | |
"pagecounts-20160131-050000.gz, size 79M\n", | |
"pagecounts-20160131-060000.gz, size 83M\n", | |
"pagecounts-20160131-070000.gz, size 79M\n", | |
"pagecounts-20160131-080000.gz, size 81M\n", | |
"pagecounts-20160131-090000.gz, size 88M\n", | |
"pagecounts-20160131-100000.gz, size 93M\n", | |
"pagecounts-20160131-110000.gz, size 98M\n", | |
"pagecounts-20160131-120000.gz, size 98M\n", | |
"pagecounts-20160131-130000.gz, size 97M\n", | |
"pagecounts-20160131-140000.gz, size 98M\n", | |
"pagecounts-20160131-150000.gz, size 99M\n", | |
"pagecounts-20160131-160000.gz, size 102M\n", | |
"pagecounts-20160131-170000.gz, size 107M\n", | |
"pagecounts-20160131-180000.gz, size 106M\n", | |
"pagecounts-20160131-190000.gz, size 104M\n", | |
"pagecounts-20160131-200000.gz, size 101M\n", | |
"pagecounts-20160131-210000.gz, size 98M\n", | |
"pagecounts-20160131-220000.gz, size 92M\n", | |
"pagecounts-20160131-230000.gz, size 89M\n", | |
"'''" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "cfae8e3a", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import requests, os\n", | |
"\n", | |
"def download_pagecounts():\n", | |
" out_dir = os.path.join('data', 'pagecounts')\n", | |
" if not os.path.exists(out_dir): os.makedirs(out_dir)\n", | |
" for i in links.split('\\n'):\n", | |
" if i.strip():\n", | |
" filename = i.split()[0].rstrip(',')\n", | |
" url = 'https://dumps.wikimedia.org/other/pagecounts-raw/2016/2016-01/{}'.format(filename)\n", | |
" open(os.path.join(out_dir, filename), 'wb').write(requests.get(url, allow_redirects=True).content)\n", | |
" for i in glob.glob(os.path.join('data', 'pagecounts', '*')):\n", | |
" os.system('gunzip ' + i)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "d6e4dd76", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import requests\n", | |
"import hashlib\n", | |
"import shutil\n", | |
"import glob\n", | |
"import json\n", | |
"import html\n", | |
"import time\n", | |
"import wget\n", | |
"import os\n", | |
"\n", | |
"def get_md5(s):\n", | |
" s = '_'.join(s.split())\n", | |
" s = html.unescape(s)\n", | |
" s = format_page_name(s)\n", | |
" m = hashlib.md5()\n", | |
" m.update(s.encode('utf8'))\n", | |
" return str(m.hexdigest())\n", | |
"\n", | |
"# columns represent: language/project, title, views, content size\n", | |
"for i in dataset: dataset[i]['views'] = 0\n", | |
" \n", | |
"for day in range(1, 32, 1):\n", | |
" day = str(day)\n", | |
" if len(day) < 2: day = '0' + day\n", | |
" print(' * processing day', day)\n", | |
" files = glob.glob(os.path.join('data', 'pagecounts', 'pagecounts-201601{}*'.format(day)))\n", | |
" for idx, i in enumerate(files):\n", | |
" with open(i) as f:\n", | |
" for j in f:\n", | |
" if not j.strip(): continue\n", | |
" try:\n", | |
" lang, name, views, size = j.split()\n", | |
" except ValueError:\n", | |
" continue\n", | |
" views = int(views)\n", | |
" if lang == 'en' and name in dataset:\n", | |
" dataset[name]['views'] += views" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "f6e25562", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"with open(os.path.join('data', 'json', 'pagecounts.json'), 'w') as out:\n", | |
" json.dump(d, out)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "05d4ee04", | |
"metadata": {}, | |
"source": [ | |
"# Unused: Collect all page images" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "df9593bb", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import requests\n", | |
"import hashlib\n", | |
"import shutil\n", | |
"import glob\n", | |
"import json\n", | |
"import time\n", | |
"import wget\n", | |
"import os\n", | |
"\n", | |
"image_urls = {}\n", | |
"\n", | |
"def get_md5(s):\n", | |
" m = hashlib.md5()\n", | |
" m.update(s.encode('utf8'))\n", | |
" return str(m.hexdigest())\n", | |
"\n", | |
"def get_image_urls(filename):\n", | |
" filename = filename.replace('.svg', '.png')\n", | |
" filename = '_'.join(filename.split())\n", | |
" filename = html.unescape(filename)\n", | |
" hashed = get_md5(filename)\n", | |
" fullsize_url = 'https://upload.wikimedia.org/wikipedia/commons/{}/{}/{}'.format(\n", | |
" hashed[0],\n", | |
" hashed[:2],\n", | |
" filename\n", | |
" )\n", | |
" thumb_url = 'https://upload.wikimedia.org/wikipedia/commons/thumb/{}/{}/{}/200px-{}'.format(\n", | |
" hashed[0],\n", | |
" hashed[:2],\n", | |
" filename,\n", | |
" filename\n", | |
" )\n", | |
" return {\n", | |
" 'full': fullsize_url,\n", | |
" 'thumb': thumb_url,\n", | |
" }\n", | |
"\n", | |
"def get_selected_image_urls():\n", | |
" image_urls = {}\n", | |
" for idx, i in enumerate(sorted(d.keys(), reverse=True)):\n", | |
" print(' * processing bucket', i)\n", | |
" if i < 40: break\n", | |
" for name in d[i]:\n", | |
" k = dataset[name]['raw']\n", | |
" article_images = []\n", | |
" # find the first image in the article\n", | |
" files = k.split('File:') \n", | |
" if len(files) > 0:\n", | |
" for file in files[1:]:\n", | |
" filename = file.split('|')[0].split(']')[0]\n", | |
" urls = get_image_urls(filename)\n", | |
" article_images.append(urls)\n", | |
" image_urls[name] = article_images\n", | |
" with open(os.path.join('data', 'json', 'image_urls.json'), 'w') as out:\n", | |
" json.dump(image_urls, out)\n", | |
" \n", | |
"def download_image(url, filename):\n", | |
" return os.system('wget \"{}\" -O \"{}\" -q'.format(url, filename))\n", | |
" \n", | |
"def download_images():\n", | |
" image_urls = json.load(open(os.path.join('data', 'json', 'image_urls.json')))\n", | |
" for name_idx, name in enumerate(image_urls):\n", | |
" urls = image_urls[name]\n", | |
" if not urls: continue\n", | |
" for url_idx, url in enumerate(urls[:10]):\n", | |
" url = '_'.join(url['full'].split())\n", | |
" ext = url.split('.')[-1]\n", | |
" if url.lower().endswith('.jpg') or url.lower().endswith('.png'):\n", | |
" try:\n", | |
" filename = os.path.join('data', 'images', format_page_name(name) + '-{}.jpg'.format(url_idx))\n", | |
" if not os.path.exists(filename):\n", | |
" download_image(url, filename)\n", | |
" except:\n", | |
" pass\n", | |
" \n", | |
"#get_selected_image_urls()\n", | |
"#download_images()" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.11" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment