@snakers4
Last active January 4, 2023 22:19
Post-process Wikipedia dump files produced by wikiextractor
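The script expects the plain-text shards that wikiextractor writes under data/wiki/ (e.g. data/wiki/AA/wiki_00); the paths and the output CSV name below are specific to a 2018 Russian Wikipedia dump, so adjust them to your own layout.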
import re
import glob
import nltk
import gensim
import pandas as pd
from tqdm import tqdm
from uuid import uuid4
from multiprocessing import Pool
from nltk.corpus import stopwords
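# NOTE: nltk.sent_tokenize and stopwords.words below rely on the NLTK 'punkt'
# and 'stopwords' data packages; on a fresh machine download them once with
# nltk.download('punkt') and nltk.download('stopwords')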
def _remove_non_printed_chars(string):
    # keep only Latin and Cyrillic letters, replace everything else with a space
    reg = re.compile('[^a-zA-Zа-яА-ЯёЁ]')
    return reg.sub(' ', string)
def _remove_stop_words(string, sw=()):
    # drop stop words; compare in lower case so that sentence-initial
    # stop words ('This', 'Это') are caught as well
    return ' '.join([word for word in string.strip().split(' ')
                     if word.lower() not in sw])
def _trim_string(string):
    # collapse repeated whitespace, remove trailing spaces, lower the case
    return re.sub(r'\s+', ' ', string).strip().lower()
def clean_string(string,
                 stop_words_list,
                 min_len=2,
                 max_len=30):
    string = _remove_non_printed_chars(string)
    string = _remove_stop_words(string, stop_words_list)
    string = _trim_string(string)
    # also remove very short / very long tokens, which are most likely
    # addresses or left-overs from the removals above;
    # gensim mostly repeats the steps above, it is used here for simplicity
    string = ' '.join(gensim.utils.simple_preprocess(string,
                                                     min_len=min_len,
                                                     max_len=max_len))
    return string
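# quick sanity check (illustrative, with the English + Russian stop words
# loaded at the bottom of the script):
# clean_string('This is a <b>simple</b> example from 2018!', sw)
# -> 'simple example'  (markup, digits, stop words and 1-letter tokens are gone)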
def splitkeepsep(s, sep):
    # split s on sep, but keep sep as the prefix of every returned chunk
    cleaned = []
    chunks = re.split("(%s)" % re.escape(sep), s)
    for chunk in chunks:
        if chunk != '' and chunk != sep:
            cleaned.append(sep + chunk)
    return cleaned
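# e.g. splitkeepsep('<doc id=1>foo</doc><doc id=2>bar</doc>', '<doc id=')
# -> ['<doc id=1>foo</doc>', '<doc id=2>bar</doc>']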
def remove_html_tags(text):
    """Remove html tags from a string"""
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)
def remove_special_chars(text, char_list):
    # drop every character in char_list and replace non-breaking spaces with plain ones
    for char in char_list:
        text = text.replace(char, '')
    return text.replace(u'\xa0', u' ')
def process_wiki_files(wiki_file):
    chars = ['\n']
    global sw
    with open(wiki_file, encoding='utf-8') as f:
        content = f.read()
    # every article produced by wikiextractor starts with a <doc id= ...> tag
    articles = splitkeepsep(content, '<doc id=')
    dfs = []
    for article in articles:
        uuid = uuid4()
        article = remove_special_chars(remove_html_tags(article),
                                       chars)
        sentences = nltk.sent_tokenize(article)
        proc_sentences = [clean_string(sentence, sw) for sentence in sentences]
        proc_lens = [len(sentence.split(' ')) for sentence in proc_sentences]
        dfs.append(pd.DataFrame(
            {'article_uuid': [uuid] * len(sentences),
             'sentence': sentences,
             'proc_sentence': proc_sentences,
             'proc_len': proc_lens}))
    # concatenate once per file instead of calling the deprecated DataFrame.append in a loop
    if dfs:
        return pd.concat(dfs, ignore_index=True)
    return pd.DataFrame(columns=['article_uuid', 'sentence', 'proc_sentence', 'proc_len'])
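# each row of the returned frame is one sentence:
# article_uuid (same for all sentences of an article), sentence (raw text),
# proc_sentence (cleaned text), proc_len (token count of the cleaned sentence)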
def list_multiprocessing(param_lst,
                         func,
                         **kwargs):
    # run func over param_lst in a process pool; any extra keyword arguments
    # except workers= are forwarded to func
    workers = kwargs.pop('workers')
    with Pool(workers) as p:
        apply_lst = [([params], func, i, kwargs) for i, params in enumerate(param_lst)]
        result = list(tqdm(p.imap(_apply_lst, apply_lst), total=len(apply_lst)))
    # imap already preserves order, but sorting by index keeps the helper
    # correct if imap_unordered is used instead
    result = sorted(result, key=lambda x: x[0])
    return [item[1] for item in result]
def _apply_lst(args):
    # unpack (params, func, index, kwargs) and return (index, result)
    # so that results can be matched back to their inputs
    params, func, num, kwargs = args
    return num, func(*params, **kwargs)
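# usage sketch: any picklable single-argument function can be mapped this way,
# e.g. list_multiprocessing(wiki_files, process_wiki_files, workers=4) as below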
# collect all shards produced by wikiextractor, e.g. data/wiki/AA/wiki_00
wiki_files = []
for filename in glob.iglob('data/wiki/*/*', recursive=True):
    wiki_files.append(filename)

# set of English + Russian stop words; a set keeps membership tests fast
sw_en = set(stopwords.words('english'))
sw_ru = set(stopwords.words('russian'))
sw = sw_ru.union(sw_en)

# workers inherit the global sw via fork-based multiprocessing (Linux)
df = list_multiprocessing(wiki_files,
                          process_wiki_files,
                          workers=4)
df = pd.concat(df).reset_index(drop=True)
df.article_uuid = df.article_uuid.astype(str)
df.to_csv('data/ruwiki_2018_09_25.csv')
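# the resulting CSV can be loaded back for downstream processing, e.g.
# df = pd.read_csv('data/ruwiki_2018_09_25.csv', index_col=0)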