derlin/get_text.py

## get_text.py
#!/usr/bin/env python

"""
Usage:

1. clone https://github.com/attardi/wikiextractor
2. download a pages-articles.xml.bz2 archive from WikiMedia dumps
3. use wikiextractor to convert the XML articles into JSON (one JSON
   object per line; this requires the `--json` flag):
    ```
    python WikiExtractor.py --json -b 100M alswiki-latest-pages-articles.xml.bz2
    ```
4. use this script to extract article lines into a text file:
    ```
    python get_text.py text/AA/wiki_00 text_only.txt
    ```
"""

import json
import sys
from io import open

def iter_article_texts(lines, min_length=20):
    """Yield the text of every article worth keeping.

    Each non-blank item of *lines* is parsed as one JSON object that must
    provide the keys ``'title'`` and ``'text'``. An article is kept when its
    title does not start with ``'MediaWiki:'`` and its text is longer than
    *min_length* characters.

    Args:
        lines: iterable of strings, one JSON document per item (an open
            text file works and is consumed lazily, line by line).
        min_length: texts with ``len(text) <= min_length`` are dropped.

    Yields:
        The ``'text'`` value of each retained article, unmodified.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a parsed object lacks ``'title'``, or a retained
            candidate lacks ``'text'``.
    """
    for line in lines:
        # Blank lines (e.g. a trailing empty line) carry no article and
        # would otherwise make json.loads() abort the whole run.
        if not line.strip():
            continue
        article = json.loads(line)
        if article['title'].startswith('MediaWiki:'):
            continue
        text = article['text']
        if len(text) > min_length:
            yield text


if __name__ == '__main__':

    # The input file is mandatory; the output file is optional.
    if len(sys.argv) < 2:
        print(f'Usage: {sys.argv[0]} json-file [output-file]')
        sys.exit(-1)

    outfile = sys.argv[2] if len(sys.argv) >= 3 else 'sentences.txt'

    with open(sys.argv[1], encoding='utf-8') as fin, \
            open(outfile, 'w', encoding='utf-8') as fout:
        # Passing the file object itself streams the input line by line
        # instead of loading the whole file with readlines().
        # writelines() adds no separators, so the output is exactly the
        # concatenation of the retained texts.
        fout.writelines(iter_article_texts(fin))
    print('done')
	#!/usr/bin/env python

	"""
	Usage:

	1. clone https://github.com/attardi/wikiextractor
	2. download a pages-articles.xml.bz2 archive from WikiMedia dumps
	3. use wikiextractor to convert the XML articles into JSON (one JSON
	   object per line; this requires the `--json` flag):
	```
	python WikiExtractor.py --json -b 100M alswiki-latest-pages-articles.xml.bz2
	```
	4. use this script to extract article lines into a text file:
	```
	python get_text.py text/AA/wiki_00 text_only.txt
	```
	"""

	import json
	import sys
	from io import open

	if __name__ == '__main__':
	    # Command-line entry point: read a file with one JSON article per
	    # line and write the text of the retained articles to a text file.
	    # NOTE(review): the nesting below was reconstructed from the meaning
	    # of the statements, because this copy had lost its indentation.

	    # The input file is mandatory; the output file is optional.
	    if len(sys.argv) < 2:
	        print(f'Usage: {sys.argv[0]} json-file [output-file]')
	        sys.exit(-1)

	    if len(sys.argv) >= 3:
	        outfile = sys.argv[2]
	    else:
	        outfile = 'sentences.txt'

	    with open(sys.argv[1], encoding='utf-8') as fin:
	        with open(outfile, 'w', encoding='utf-8') as fout:
	            # Iterating the file object streams it line by line instead
	            # of loading the whole file with readlines().
	            for line in fin:
	                # Blank lines carry no article and would otherwise make
	                # json.loads() abort the whole run.
	                if not line.strip():
	                    continue
	                j = json.loads(line)
	                # Keep articles outside the 'MediaWiki:' title prefix
	                # whose text is longer than 20 characters.
	                if not j['title'].startswith('MediaWiki:') and \
	                        len(j['text']) > 20:
	                    fout.write(j['text'])
	    print('done')