This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Testing the model on a sentence from
# https://en.wikipedia.org/wiki/Che_Guevara - positive sentence
loaded_model = spacy.load('model_artifactnewdatatest1LR0.01L22E-4')
test_text = "Such positions also allowed him to play a central role in training the militia forces who repelled the Bay of Pigs Invasion and bringing the Soviet nuclear-armed ballistic missiles to Cuba which precipitated the 1962 Cuban Missile Crisis "
doc = loaded_model(test_text)
# Category scores assigned to the sentence. As a bare expression this is only
# displayed in an interactive/notebook session; it has no effect in a script.
doc.cats
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def evaluate(tokenizer, textcat, texts, cats):
    """
    Evaluate the performance of TextCategorizer prediction
    Calculate accuracy, f1 score, precision, recall
    parameters:
        tokenizer: object - spaCy tokenizer (presumably nlp.tokenizer; confirm against the caller)
        textcat: TextCategorizer
        texts : input text to be evaluated
        cats : input label
    """
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def load_model(model_path='C:/Project'):
    """
    Load the saved spaCy pipeline and its text categorizer into module globals
    Parameters:
        model_path: path handed to spacy.load. Defaults to 'C:/Project',
                    the location that used to be hard-coded here
    Side effects:
        Rebinds the module-level names `nlp` and `textcat`
    """
    # declare global variables
    global nlp
    global textcat
    nlp = spacy.load(model_path)  # load the pipeline stored at model_path
    textcat = nlp.get_pipe('textcat')  # fetch the 'textcat' component of that pipeline
def main():
    """Wikipedia Citation Needed NLP app with Spacy-Streamlit"""
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def training(train_texts, train_cats, dev_texts, dev_cats, test_texts, test_cats, L2, learn_rate, n_iter, output_dir=None): | |
""" | |
Spacy example function modified | |
Trains citation needed classifier and saves model | |
Parameters: | |
train_texts :str -list - text train features | |
train_cats :str - list - label citation sentence - TRUE else FALSE | |
dev_texts :str - list - text dev features | |
dev_cats :str - list - label citation sentence - TRUE else FALSE | |
test_texts :str - list - text test features |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def label_creator(x):
    """
    Find and remove citation from the text and creates labels
    parameters:
        x : str - text that may contain bracketed citation markers
    returns :
        cleanx : str - cleaned text without citation
        label : int - sentence with citation: 1 else 0
    """
    # Non-greedy match of one bracketed span. Raw string so the backslashes
    # reach the regex engine instead of being read as (invalid) string escapes.
    infix = re.compile(r'\[(.+?)\]')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def load_data(df, split=0.2):
    """
    Function from spaCy
    Prepare the training data as per spaCy format
    Parameters:
        df: training data in pandas dataframe
        split: float - fraction used to split the dataframe into train and validation sets. Defaults to 0.2
    Returns:
        tuples: train and validation text and labels
    """
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def custom_sentence_boundary(doc):
    """
    Mark the token that follows every ']' as the start of a new sentence
    Parameters:
        doc: indexable sequence of tokens, each exposing `text` and `sent_start`
    Returns:
        doc: the same object, modified in place
    """
    last_index = len(doc) - 1
    for i, token in enumerate(doc):
        # A ']' that is the final token has no successor to mark; indexing
        # doc[i + 1] there would raise IndexError.
        if token.text == ']' and i < last_index:
            doc[i + 1].sent_start = True
    return doc
def sentence_tokenization(text_batches):
    # Load the small English pipeline and insert the custom boundary rule
    # ahead of the parser component.
    nlp = spacy.load('en_core_web_sm')
    # NOTE(review): passing the function object itself is the spaCy v2 form of
    # add_pipe; spaCy v3 expects the name of a registered component - confirm
    # which spaCy version this project pins.
    nlp.add_pipe(custom_sentence_boundary, before='parser')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# crawling website | |
def getLinks(url): | |
html_page = urlopen(url) | |
soup = BeautifulSoup(html_page) | |
total_pages = [] | |
try: | |
for link in soup.find_all('a', href=True): | |
if link.get('href') not in total_pages: | |
total_pages.append(link.get('href')) | |
except: |