Sharvari Dhote sharvaridhote

## test.py
# Smoke test of the trained model on a sentence taken from
# https://en.wikipedia.org/wiki/Che_Guevara. The original comment labels it a
# "positive sentence" (presumably one that needs a citation -- TODO confirm).
# Load the saved pipeline from the model artifact directory.
loaded_model = spacy.load('model_artifactnewdatatest1LR0.01L22E-4')
test_text= "Such positions also allowed him to play a central role in training the militia forces who repelled the Bay of Pigs Invasion and bringing the Soviet nuclear-armed ballistic missiles to Cuba which precipitated the 1962 Cuban Missile Crisis "
# Run the loaded pipeline on the sample sentence.
doc=loaded_model(test_text)
# Bare expression: it has no effect when run as a script; in a notebook/REPL
# it displays the document's category scores.
doc.cats

## evaluate.py
def evaluate(tokenizer, textcat, texts, cats):
    """
    Evaluate the performance of the TextCategorizer predictions.

    Per the original description, calculates accuracy, F1 score, precision
    and recall.

    Parameters:
        tokenizer: spaCy object (the original docstring named this `nlp`)
        textcat: TextCategorizer pipe
        texts: input texts to be evaluated
        cats: input labels for `texts`

    NOTE(review): the function body is not visible in this excerpt; the
    behaviour described above is taken from the original docstring --
    confirm against the full implementation.
    """

## app.py
def load_model():
    """
    Declare `nlp` and `textcat` as module-level globals.

    NOTE(review): as written, this function contains only `global`
    declarations and assigns nothing -- presumably the model-loading
    statements were meant to live inside it; confirm.
    """
    # declare global variables
    global nlp
    global textcat

# NOTE(review): these assignments sit at module level, so they run at import
# time rather than when load_model() is called -- confirm that is intended.
nlp = spacy.load('C:/Project')  # load the spaCy pipeline from the hard-coded model path
textcat = nlp.get_pipe('textcat')  # fetch the 'textcat' component of the loaded pipeline

def main():
    """Wikipedia "Citation Needed" NLP app built with spaCy and Streamlit."""
    # NOTE(review): the rest of the body is not visible in this excerpt.

## train.py
def training(train_texts, train_cats, dev_texts, dev_cats, test_texts, test_cats, L2, learn_rate, n_iter,  output_dir=None):
    """
       Spacy example function modified
       Trains citation needed classifier and saves model
       Parameters:
           train_texts :str -list - text train features
           train_cats :str - list - label citation sentence - TRUE else FALSE
           dev_texts :str - list - text train features
           dev_cats :str - list - label citation sentence - TRUE else FALSE
           test_texts :str - list - text train features

## label_creator.py
def label_creator(x):
    """
    Find and remove citations from the text and create labels.

    Parameters:
        x : str - characters in the string
    Returns:
        cleanx : str - cleaned text without citation
        label : int - sentence with citation: 1 else 0

    NOTE(review): only the pattern setup is visible in this excerpt, so as
    shown the function returns None; the return contract above is taken
    from the original docstring -- confirm against the full body.
    """
    # Raw string: '\[' is not a valid string escape and triggers an
    # invalid-escape warning on current Python versions. The compiled
    # pattern is unchanged: the shortest non-empty text between '[' and ']'.
    infix = re.compile(r'\[(.+?)\]')

## load_data.py
def load_data(df, split=0.2):
    """
    Prepare the training data in the format spaCy expects.

    Adapted from a spaCy example function.

    Parameters:
        df: training data in a pandas DataFrame
        split: float - controls the split of the dataframe into train and
            validation sets. Defaults to 0.2
    Returns:
        tuples: train and validation texts and labels

    NOTE(review): the function body is not visible in this excerpt; the
    contract above is taken from the original docstring -- confirm.
    """

## sentencer.py
def custom_sentence_boundary(doc):
    """
    Force a sentence start on the token that follows each closing citation
    bracket ']'.

    Parameters:
        doc: token sequence (a spaCy Doc when used as a pipeline component);
            each token exposes `.text` and a writable `.sent_start`
    Returns:
        the same `doc` object, modified in place
    """
    last_index = len(doc) - 1
    for i, token in enumerate(doc):
        # A ']' that is the final token has no successor to mark; indexing
        # doc[i + 1] there would raise IndexError, so it is skipped.
        if token.text == ']' and i < last_index:
            doc[i + 1].sent_start = True
    return doc

def sentence_tokenization(text_batches):
    """
    Set up a spaCy pipeline that applies `custom_sentence_boundary` before
    the parser.

    Parameters:
        text_batches: not used in the visible part of the body

    NOTE(review): the body appears truncated in this excerpt (nothing is
    returned and `text_batches` is never read) -- confirm against the full
    implementation. Passing a function object to `add_pipe` is the spaCy v2
    call style; verify the targeted spaCy version.
    """
    # Load the small English pipeline.
    nlp = spacy.load('en_core_web_sm')
    # Register the custom boundary component ahead of the parser.
    nlp.add_pipe(custom_sentence_boundary, before='parser')

## scraper.py
# crawling website
def getLinks(url):
    html_page = urlopen(url)
    soup = BeautifulSoup(html_page)
    total_pages = []
    try:
        for link in soup.find_all('a', href=True):
            if link.get('href') not in total_pages:
                total_pages.append(link.get('href'))
    except:
	# Smoke test of the trained model on a sentence taken from
	# https://en.wikipedia.org/wiki/Che_Guevara. The original comment labels it a
	# "positive sentence" (presumably one that needs a citation -- TODO confirm).
	# Load the saved pipeline from the model artifact directory.
	loaded_model = spacy.load('model_artifactnewdatatest1LR0.01L22E-4')
	test_text= "Such positions also allowed him to play a central role in training the militia forces who repelled the Bay of Pigs Invasion and bringing the Soviet nuclear-armed ballistic missiles to Cuba which precipitated the 1962 Cuban Missile Crisis "
	# Run the loaded pipeline on the sample sentence.
	doc=loaded_model(test_text)
	# Bare expression: it has no effect when run as a script; in a notebook/REPL
	# it displays the document's category scores.
	doc.cats
	def evaluate(tokenizer, textcat, texts, cats):
		"""
		Evaluate the performance of the TextCategorizer predictions.

		Per the original description, calculates accuracy, F1 score,
		precision and recall.

		Parameters:
			tokenizer: spaCy object (the original docstring named this `nlp`)
			textcat: TextCategorizer pipe
			texts: input texts to be evaluated
			cats: input labels for `texts`

		NOTE(review): only the signature and docstring are visible in this
		excerpt, so as shown the function returns None -- confirm against
		the full implementation.
		"""
	def load_model():
		"""
		Declare `nlp` and `textcat` as module-level globals.

		NOTE(review): as written, this function contains only `global`
		declarations and assigns nothing -- presumably the model-loading
		statements were meant to live inside it; confirm.
		"""
		# declare global variables
		global nlp
		global textcat

	# NOTE(review): these assignments are not part of load_model() as written --
	# confirm where they are meant to run.
	nlp = spacy.load('C:/Project') # load the spaCy pipeline from the hard-coded model path
	textcat = nlp.get_pipe('textcat') # fetch the 'textcat' component of the loaded pipeline

	def main():
		"""Wikipedia "Citation Needed" NLP app built with spaCy and Streamlit."""
		# NOTE(review): the rest of the body is not visible in this excerpt.
	def training(train_texts, train_cats, dev_texts, dev_cats, test_texts, test_cats, L2, learn_rate, n_iter, output_dir=None):
	"""
	Spacy example function modified
	Trains citation needed classifier and saves model
	Parameters:
	train_texts :str -list - text train features
	train_cats :str - list - label citation sentence - TRUE else FALSE
	dev_texts :str - list - text train features
	dev_cats :str - list - label citation sentence - TRUE else FALSE
	test_texts :str - list - text train features
	def label_creator(x):
		"""
		Find and remove citations from the text and create labels.

		Parameters:
			x : str - characters in the string
		Returns:
			cleanx : str - cleaned text without citation
			label : int - sentence with citation: 1 else 0

		NOTE(review): only the pattern setup is visible in this excerpt, so
		as shown the function returns None; the return contract above is
		taken from the original docstring -- confirm against the full body.
		"""
		# Raw string: '\[' is not a valid string escape and triggers an
		# invalid-escape warning on current Python versions. The compiled
		# pattern is unchanged: the shortest non-empty text between '[' and ']'.
		infix = re.compile(r'\[(.+?)\]')
	def load_data(df, split=0.2):
		"""
		Prepare the training data in the format spaCy expects.

		Adapted from a spaCy example function.

		Parameters:
			df: training data in a pandas DataFrame
			split: float - controls the split of the dataframe into train
				and validation sets. Defaults to 0.2
		Returns:
			tuples: train and validation texts and labels

		NOTE(review): only the signature and docstring are visible in this
		excerpt, so as shown the function returns None -- confirm against
		the full implementation.
		"""
	def custom_sentence_boundary(doc):
		"""
		Force a sentence start on the token that follows each closing
		citation bracket ']'.

		Parameters:
			doc: token sequence (a spaCy Doc when used as a pipeline
				component); each token exposes `.text` and a writable
				`.sent_start`
		Returns:
			the same `doc` object, modified in place
		"""
		last_index = len(doc) - 1
		for i, token in enumerate(doc):
			# A ']' that is the final token has no successor to mark;
			# indexing doc[i + 1] there would raise IndexError, so skip it.
			if token.text == ']' and i < last_index:
				doc[i + 1].sent_start = True
		return doc

	def sentence_tokenization(text_batches):
		"""
		Set up a spaCy pipeline that applies `custom_sentence_boundary`
		before the parser.

		Parameters:
			text_batches: not used in the visible part of the body

		NOTE(review): the body appears truncated in this excerpt (nothing
		is returned and `text_batches` is never read) -- confirm against the
		full implementation. Passing a function object to `add_pipe` is the
		spaCy v2 call style; verify the targeted spaCy version.
		"""
		# Load the small English pipeline.
		nlp = spacy.load('en_core_web_sm')
		# Register the custom boundary component ahead of the parser.
		nlp.add_pipe(custom_sentence_boundary, before='parser')
	# crawling website
	def getLinks(url):
	html_page = urlopen(url)
	soup = BeautifulSoup(html_page)
	total_pages = []
	try:
	for link in soup.find_all('a', href=True):
	if link.get('href') not in total_pages:
	total_pages.append(link.get('href'))
	except: