Skip to content

Instantly share code, notes, and snippets.

View erickrf's full-sized avatar

Erick Fonseca erickrf

View GitHub Profile
@erickrf
erickrf / json2txt.py
Created August 19, 2019 21:04
Script to join text from JSON files for training GPT-2
import json
import os
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('input_dir', help='Directory with wiki json files')
parser.add_argument('output', help='Txt file output')
args = parser.parse_args()
for filename in os.listdir(args.input_dir):
@erickrf
erickrf / find-ambiguous-lemmas.py
Created March 11, 2019 23:35
Script to find combinations of words and tags that have more than one lemma in the UD treebanks
import argparse
from collections import defaultdict
parser = argparse.ArgumentParser()
parser.add_argument('input',
help='Input file in CoNLLU format')
parser.add_argument('-u', action='store_true', dest='upos',
help='Use UPOS to disambiguate')
parser.add_argument('-x', action='store_true', dest='xpos',
help='Use XPOS to disambiguate')
@erickrf
erickrf / openwordnetpt.py
Last active October 23, 2018 20:53
Functions to access the OpenWordnetPT graph
# -*- coding: utf-8 -*-
'''
Functions to read the OpenWordnetPT from RDF files and provide
access to it.
'''
import rdflib
from six.moves import cPickle
@erickrf
erickrf / read_embeddings.py
Last active December 18, 2023 09:13
Read an embeddings file in text format and convert it to NumPy format
import numpy as np
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('input', help='Single embedding file')
parser.add_argument('output', help='Output basename without extension')
args = parser.parse_args()
embeddings_file = args.output + '.npy'
vocabulary_file = args.output + '.txt'
@erickrf
erickrf / tokenizer.py
Last active March 5, 2023 05:12
Portuguese tokenizer
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from nltk.tokenize import RegexpTokenizer
import argparse
import os
"""
Script for tokenizing Portuguese text according to the Universal Dependencies
(UD) tokenization standards. This script was not created by the UD team; it was