Skip to content

Instantly share code, notes, and snippets.

View Nithilaa's full-sized avatar

Nithilaa Umasankar Nithilaa

  • Coimbatore
View GitHub Profile
pip install lexrank
# Summarize `text` with sumy's LexRank summarizer.
# Expects `text` (the raw input string) to be defined before this snippet runs.

# Parse the plain-text string into a sumy document using the English tokenizer.
parser = PlaintextParser.from_string(text, Tokenizer("english"))
# Summarizer instance, constructed with its default settings.
summarizer_lex = LexRankSummarizer()
# Second argument is the requested summary length (2 sentences here);
# the result is an iterable of sentence objects.
summary = summarizer_lex(parser.document, 2)
# Flatten the summarizer output into a single string, one sentence per line.
# Every sentence (including the last) is followed by "\n", matching what the
# original accumulate-in-a-loop code produced; an empty `summary` yields "".
# Expects `summary` (an iterable of sentence objects) to be defined beforehand.
# NOTE(review): the original loop body had lost its indentation; this assumes
# both `+=` statements belonged inside the loop — confirm against the notebook.
lex_summary = "".join(str(sentence) + "\n" for sentence in summary)
text = """America has changed dramatically during recent years.
Not only has the number of graduates in traditional engineering disciplines such as mechanical, civil, electrical, chemical, and aeronautical engineering declined, but in most of the premier American universities engineering curricula now concentrate on and encourage largely the study of engineering science.
As a result, there are declining offerings in engineering subjects dealing with infrastructure, the environment, and related issues, and greater concentration on high technology subjects, largely supporting increasingly complex scientific developments.
While the latter is important, it should not be at the expense of more traditional engineering.
Rapidly developing economies such as China and India, as well as other industrial countries in Europe and Asia, continue to encourage and advance the teaching of engineering.
Both China and India, respectively, graduate six and eight times as many traditional engineers as does the United States.
import nltk
nltk.download('punkt')
import sumy
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
pip install sumy
# Combine the two word-level augmenters in an nlpaug `Sometimes` flow.
# Expects `aug_bert`, `aug_w2v` and `text` to be defined before this runs.
# NOTE(review): `pipeline_p` appears to be accepted only by older nlpaug
# releases — confirm against the installed version.
aug2 = naf.Sometimes([
aug_bert,aug_w2v
],aug_p=0.5, pipeline_p=0.5)
# Request 10 augmented variants of `text`. The return value is not stored, so
# it is only visible when this is the last expression of a notebook cell.
aug2.augment(text, n=10) # Results look better than sequential
# try Compose & Sometimes
# make offline augmentation + pseudo label from my best ensemble
# re-train
# Sample sentence used as the augmentation input.
text = "What is your recommended book on Bayesian Statistics?"
# Chain both augmenters in an nlpaug `Sequential` flow (listed order:
# `aug_bert`, then `aug_w2v`). Both must be defined before this runs.
aug = naf.Sequential([
aug_bert,aug_w2v
])
# Word-embedding augmenter that substitutes words, backed by GloVe vectors.
# model_type: word2vec, glove or fasttext
# NOTE(review): the model path is hard-coded to /content/, which looks like a
# Colab working directory — the 300-d GloVe file must already exist there.
aug_w2v = naw.WordEmbsAug(
model_type='glove', model_path='/content/glove.6B.300d.txt',
action="substitute")
# Show the un-augmented sentence under an "Original:" header (one item per line).
# Expects `test_sentence` to be defined before this snippet runs.
print("Original:", test_sentence, sep="\n")
# Contextual word-embedding augmenter backed by DistilBERT.
# TOPK is passed as `top_k`; presumably the number of candidate tokens the
# model proposes per position — confirm against the nlpaug docs.
TOPK=20 #default=100
# Augmentation action: 'insert' adds new words; "substitute" is the alternative.
ACT = 'insert' #"substitute"
aug_bert = naw.ContextualWordEmbsAug(
model_path='distilbert-base-uncased',
# Uncomment the next line to run the model on a GPU.
#device='cuda',
action=ACT, top_k=TOPK)
# Print the original sentence under its header, then the header that precedes
# the augmented output (one item per line).
# Expects `test_sentence` to be defined before this snippet runs.
print("Original:", test_sentence, "Augmented Text:", sep="\n")
# Word-splitting augmenter (nlpaug `SplitAug`) with every option spelled out.
# aug_p=0.3 with aug_min=1 / aug_max=10 presumably bounds how many words are
# altered, and min_char=4 presumably skips shorter words — confirm against the
# nlpaug docs.
aug = naw.SplitAug(name='Split_Aug', aug_min=1, aug_max=10, aug_p=0.3, min_char=4, stopwords=None, tokenizer=None,
reverse_tokenizer=None, stopwords_regex=None, verbose=0)
# Augment the sample sentence; `test_sentence` must be defined beforehand.
test_sentence_aug = aug.augment(test_sentence)
# Print the input followed by the augmented result for side-by-side comparison.
print(test_sentence)
print(test_sentence_aug)