Extracts article text from a Wikimedia dump (Tamil Wikipedia)
import json
import shutil
import string
import subprocess
from pathlib import Path

import git
import pandas as pd
import requests as req
# Output paths; change these if needed
DATA_PATH = Path('/kaggle/working/')
EXTRACTED_PATH = DATA_PATH/'extracted'
EXTRACTED_PATH.mkdir()
# Download the latest Tamil Wikipedia pages-articles dump
bzip_file = req.get('https://dumps.wikimedia.org/tawiki/latest/tawiki-latest-pages-articles.xml.bz2')
# Save the response content to a file
with open(DATA_PATH/'tawiki-latest-pages-articles.xml.bz2', 'wb') as f:
    f.write(bzip_file.content)
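# Note (not in the original gist): bzip_file.content holds the whole archive in
# memory; for larger dumps, requests' stream=True with iter_content() would
# avoid that.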
# Clone WikiExtractor from GitHub (thanks, https://github.com/attardi)
git.Git(str(DATA_PATH)).clone("https://github.com/attardi/wikiextractor.git")
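# GitPython runs `git clone` with DATA_PATH as the working directory,
# so the repository lands in DATA_PATH/'wikiextractor'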
# Run the WikiExtractor script on the downloaded archive
print('Extracting data from dump archive')
run_stat = subprocess.run(
    ['python',
     # Script to run
     str(DATA_PATH/'wikiextractor/WikiExtractor.py'),
     # Output options: keep sections ('-s') and write each article as JSON ('--json')
     '-s', '--json',
     # Directory to store the extracted text
     '-o', str(DATA_PATH/'extracted'),
     # Archive file to extract from
     str(DATA_PATH/'tawiki-latest-pages-articles.xml.bz2')]
)
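# Optional check (not in the original gist): subprocess.run returns a
# CompletedProcess, so a non-zero returncode flags a failed extraction early
if run_stat.returncode != 0:
    print('WikiExtractor exited with code', run_stat.returncode)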
# List the files WikiExtractor wrote under the extraction folder
files_extracted = [str(f) for f in EXTRACTED_PATH.rglob("*/*")]
# Every output file holds one JSON object per line, so they can be loaded like this
# lang_text = [json.loads(line) for _file in files_extracted for line in open(_file)]
lang_text = []
for _file in files_extracted:
    with open(_file, 'r') as f:
        for line in f:
            lang_text.append(json.loads(line))
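# Each record is a dict produced by WikiExtractor's --json mode; it typically
# carries 'id', 'url', 'title' and 'text' keys, with the article body in 'text'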
# Filter out English words:
# strip punctuation from each word and drop the word if what remains is purely alphabetic
# filter_english = lambda text: ' '.join([word for word in text.split() if word.translate(str.maketrans('', '', string.punctuation)).isalpha() is False])
def filter_english(text):
    words = []
    for word in text.split():
        word = word.translate(str.maketrans('', '', string.punctuation))
        if not word.isalpha():
            words.append(word)
    return ' '.join(words)
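# Quick sanity check (an assumption about the intended behaviour, not part of
# the original gist): Tamil words usually contain combining vowel signs or a
# virama, so str.isalpha() is False for them and they survive the filter,
# while plain English words are dropped, e.g.
# >>> filter_english('hello தமிழ் world')
# 'தமிழ்'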
lang_df = pd.DataFrame(lang_text)
lang_df['text'] = lang_df['text'].apply(filter_english)
# Check the dataframe structure
print(lang_df.info())
print(lang_df.head())
# Store the output in compressed format
lang_df.to_csv(DATA_PATH/'filtered_data.csv.tar.gz', header=True)
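# Note (not in the original gist): pandas infers gzip compression from the
# '.gz' suffix, so this writes a single gzip-compressed CSV rather than a tar
# archive, despite the '.tar.gz' name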
# Clean up the downloaded and intermediate files
shutil.rmtree(str(EXTRACTED_PATH))
shutil.rmtree(str(DATA_PATH/'wikiextractor'))
(DATA_PATH/'tawiki-latest-pages-articles.xml.bz2').unlink()