Manu Suryavansh (suryavanshi)

from transformers import pipeline

# Augment text with GPT-2 by letting the model append a few words to the input sentence
generator = pipeline('text-generation', model='gpt2')
input_text = "I went to see a movie in the theater"
input_length = len(input_text.split())
num_new_words = 5
# max_length is counted in tokens, so the word count is only a rough budget for the continuation
output_length = input_length + num_new_words
gpt_output = generator(input_text, max_length=output_length, num_return_sequences=5)
augmented_text = gpt_output[0]['generated_text']
print("Augmented text->", augmented_text)
from transformers import pipeline
import random

# Augment text with BERT by replacing one randomly chosen word with a masked-language-model prediction
unmasker = pipeline('fill-mask', model='bert-base-cased')
input_text = "I went to see a movie in the theater"
orig_text_list = input_text.split()
len_input = len(orig_text_list)
# Random index where we want to replace the word
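
The gist ends at the comment above; a minimal sketch of the replacement step it sets up, assuming the standard fill-mask pipeline output (rand_idx and new_text_list are my own names):

# Replace a randomly chosen word with [MASK] and let BERT fill it in
rand_idx = random.randint(0, len_input - 1)
new_text_list = orig_text_list.copy()
new_text_list[rand_idx] = '[MASK]'  # mask token for bert-base-cased
augmented_text = unmasker(' '.join(new_text_list))[0]['sequence']  # highest-scoring prediction
print("Augmented text->", augmented_text)
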
from transformers import pipeline
import random

# Augment text with BERT by inserting a new, model-predicted word at a random position
unmasker = pipeline('fill-mask', model='bert-base-cased')
input_text = "I went to see a movie in the theater"
orig_text_list = input_text.split()
len_input = len(orig_text_list)
# Random index where we want to insert the word, except at the start or end
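
As above, the gist stops at the comment; a sketch of the insertion step, again with my own variable names and the same fill-mask output assumption:

# Insert a [MASK] between two existing words (never before the first or after the last word)
rand_idx = random.randint(1, len_input - 1)
new_text_list = orig_text_list[:rand_idx] + ['[MASK]'] + orig_text_list[rand_idx:]
augmented_text = unmasker(' '.join(new_text_list))[0]['sequence']
print("Augmented text->", augmented_text)
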
suryavanshi / back_translate.py (Created August 13, 2021)

from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# English to German using the pipeline and T5
translator_en_to_de = pipeline("translation_en_to_de", model='t5-base')
# German to English using the Bert2Bert model
tokenizer = AutoTokenizer.from_pretrained("google/bert2bert_L-24_wmt_de_en", pad_token="<pad>", eos_token="</s>", bos_token="<s>")
model_de_to_en = AutoModelForSeq2SeqLM.from_pretrained("google/bert2bert_L-24_wmt_de_en")
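
The gist sets up both directions but stops before translating; a sketch of the back-translation round trip, following the Hugging Face model card usage for the Bert2Bert checkpoint (the example sentence is reused from the earlier gists):

input_text = "I went to see a movie in the theater"
# English -> German with the T5 pipeline
de_text = translator_en_to_de(input_text)[0]['translation_text']
# German -> English with the Bert2Bert model, giving a paraphrase of the original sentence
input_ids = tokenizer(de_text, return_tensors="pt", add_special_tokens=False).input_ids
output_ids = model_de_to_en.generate(input_ids)[0]
augmented_text = tokenizer.decode(output_ids, skip_special_tokens=True)
print("Augmented text->", augmented_text)
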
suryavanshi / presidio_pii.py (Last active May 19, 2021)

# From https://microsoft.github.io/presidio/getting_started/
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine

text = "My phone number is 212-555-5555"

# Set up the engine; loads the NLP module (spaCy model by default)
# and the other PII recognizers
analyzer = AnalyzerEngine()
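
The snippet stops after building the analyzer; the linked Presidio quickstart continues roughly as below (the entity filter is illustrative):

# Analyze the text for PII entities (here only phone numbers)
results = analyzer.analyze(text=text, entities=["PHONE_NUMBER"], language='en')
print(results)

# Pass the analyzer results to the anonymizer to redact the detected spans
anonymizer = AnonymizerEngine()
anonymized_text = anonymizer.anonymize(text=text, analyzer_results=results)
print(anonymized_text)
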
import spacy  # Using spaCy version 2.2.3
nlp = spacy.load("en_core_web_lg")
inp_text = "My name is John Wick, I live in California"
doc = nlp(inp_text)
# Print the named entities spaCy detects (the PII we want to remove)
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)
new_tokens = []
for token in doc: