Skip to content

Instantly share code, notes, and snippets.

@ymoslem
Created December 2, 2021 14:25
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ymoslem/b8af6d69a812e1d0662709d84b7e15fa to your computer and use it in GitHub Desktop.
Save ymoslem/b8af6d69a812e1d0662709d84b7e15fa to your computer and use it in GitHub Desktop.
# Dataset source: https://webz.io/free-datasets/
# Spanish news archive: https://s3.amazonaws.com/webhose-archive/datasets/645_20170904091816.zip
# Extract the "text" field from each JSON file and write it out one sentence per line.
import os
import json
from sentence_splitter import split_text_into_sentences
from tqdm import tqdm
# Build a sentence-per-line Spanish corpus from the JSON files in `root`.
#
# Each JSON file is expected to hold an object with a "text" key; a file
# without that key raises KeyError, as before.

# Collect the JSON files that sit directly in `root` (no recursion), in a
# deterministic, sorted order.
root = "."
filenames = sorted(
    item
    for item in os.listdir(root)
    if item.endswith(".json") and os.path.isfile(os.path.join(root, item))
)
print("Number of files:", len(filenames))

# Running total of sentences written to the corpus during this run.
count = 0

# Open the corpus once instead of once per input file. Append mode is kept,
# so re-running the script adds to an existing corpus rather than replacing
# it. UTF-8 is explicit because open() otherwise falls back to the locale's
# preferred encoding, which can mangle or reject accented Spanish characters.
with open("spanish-news-corpus.es", "a", encoding="utf-8") as outputfilename:
    for filename in tqdm(filenames):
        # Join with `root` so the lookup keeps working if `root` is ever
        # pointed at something other than the current directory.
        with open(os.path.join(root, filename), "r", encoding="utf-8") as jsn:
            output = json.load(jsn)
        text = output["text"]
        lines = split_text_into_sentences(text=text, language="es")

        # Drop entries that are empty after stripping so the corpus never
        # contains blank lines and `count` matches the lines actually written.
        sentences = [stripped for stripped in (line.strip() for line in lines) if stripped]
        if not sentences:
            continue

        outputfilename.write("\n".join(sentences) + "\n")
        count += len(sentences)
print("Number of lines", count)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment