Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
GC4 Corpus Filtering Scripts
import json
import gzip
import pathlib
import os
import pdb
from ast import literal_eval
from tqdm import tqdm
def split_records(line):
    """Split a line of concatenated dict literals into one string per record.

    Every record is expected to start with the marker ``{'url'``. Whatever
    precedes the first marker is discarded because it is not a record.
    Trailing whitespace (e.g. the line's newline) is stripped from each
    record so the caller can write exactly one record per output line.

    Args:
        line: Raw text containing zero or more back-to-back dict literals.

    Returns:
        A list of record strings, each beginning with ``{'url'``.
    """
    marker = "{'url'"
    # split() removes the marker, so it is put back in front of every chunk;
    # index 0 is the text before the first marker and is skipped.
    return [marker + chunk.rstrip() for chunk in line.split(marker)[1:]]


def is_high_confidence(record_text, min_score=0.98):
    """Return True if the record's ``language_score`` exceeds ``min_score``.

    The record is parsed exactly once with ``ast.literal_eval``. Records that
    cannot be parsed, are not dicts, lack ``language_score`` or carry a
    non-comparable score are treated as not matching (best-effort filtering),
    rather than aborting the whole run.

    Args:
        record_text: A single Python dict literal as text.
        min_score: Exclusive lower bound for ``language_score``.

    Returns:
        bool: whether the record should be kept.
    """
    try:
        record = literal_eval(record_text)
        return bool(record['language_score'] > min_score)
    except (ValueError, SyntaxError, TypeError, KeyError):
        # Malformed or incomplete record: skip it, but let real interrupts
        # (KeyboardInterrupt, SystemExit) propagate.
        return False


if __name__ == '__main__':
    parent_dir = pathlib.Path("data_head_url")
    for file in tqdm(parent_dir.iterdir()):
        # NOTE(review): only the first line of the gzip stream is read, and
        # the input is opened with gzip alone; if the inputs are really
        # .tar.gz archives the tar layer is not unpacked here - confirm.
        with gzip.open(file, 'rt') as f:
            line = f.readline()

        records = split_records(line)
        kept = [text for text in tqdm(records) if is_high_confidence(text)]

        # Write the *filtered* records, one per line.
        # NOTE(review): the output is plain gzip despite the ".tar.gz" suffix;
        # the name is kept unchanged so existing consumers still find it.
        with gzip.open(f"{file.name}_filtered.tar.gz", 'wt') as file_new:
            file_new.writelines(text + '\n' for text in kept)
@PhilipMay

This comment has been minimized.

Copy link

@PhilipMay PhilipMay commented Sep 14, 2021

literal_eval(item) is called in both line 22 and line 23; it should be called only once and the result reused.

@PhilipMay

This comment has been minimized.

Copy link

@PhilipMay PhilipMay commented Sep 15, 2021

I saw that this extracts the gz part but not the tar part. IMO something like this is better to get the content:

import tarfile

# Open the archive as gzip-compressed tar ("r:gz") so that both layers are
# unpacked, instead of only stripping the gzip layer.
file = "de_head_0007_2020-10.tar.gz"
with tarfile.open(file, "r:gz") as tar_file:
    members = tar_file.getmembers()
    print(members)

    # This snippet expects the archive to contain exactly one member.
    assert len(members) == 1

    # extractfile() returns None for members that are not regular files or
    # links, so check before reading. A separate name is used so the path
    # string in `file` is not overwritten.
    member_file = tar_file.extractfile(members[0])
    if member_file is None:
        raise ValueError(f"{members[0].name} is not a regular file")
    file_content = member_file.read().decode('utf-8')

    # Preview the beginning of the decoded content.
    print(file_content[:2000])
@PhilipMay

This comment has been minimized.

Copy link

@PhilipMay PhilipMay commented Sep 15, 2021

Line 28 writes a, but before that the script filters a to generate b.
The script should write b (the filtered records) instead of a.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment