Manmohan Singh manmohan24nov

## spacy_keyword_extraction.py
>>> import spacy
>>> nlp = spacy.load("en_core_sci_lg")
>>> text = """spaCy is an open-source software library for advanced natural language processing,
written in the programming languages Python and Cython. The library is published under the MIT license
and its main developers are Matthew Honnibal and Ines Montani, the founders of the software company Explosion."""
>>> doc = nlp(text)
>>> print(doc.ents)
(spaCy, open-source software library, written, programming languages,
 Python, Cython, library, MIT, license, developers, Matthew Honnibal,
 Ines, Montani, founders, software company)

## roberta_twitter_comment.py
import pandas as pd
# Recommended tensorflow version is <= 2.1.0, otherwise F1 score function breaks
import tensorflow as tf
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import tensorflow_datasets as tfds
from transformers import TFRobertaForSequenceClassification
from transformers import RobertaTokenizer

# Load your Dataset

## create_small_audio_files.py
from pydub import AudioSegment


mp3_audio = AudioSegment.from_file(r"audio_full.wav", format="wav")
print(len(mp3_audio)/(1000*60))
# The 12-minute audio is broken into four 3-minute audio files (slicing is done in milliseconds)

counter_audio = 180
split_audio = [mp3_audio[:180*1000]]
for i in range(4):

## speech_to_text_new.py
# A python package for music and audio analysis.
# https://librosa.org/doc/latest/index.html
import librosa
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer

# load model and tokenizer
tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

## sentiment_textblob.py
# PRAW to interact with reddit
import praw
# Install textblob, if not already installed, using "pip install -U textblob"
from textblob import TextBlob
import nltk
# Download VADER, if not downloaded
# nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# create object for VADER sentiment function interaction

## cluster_reddit_comments.py
>>> import praw
>>> import pandas as pd
>>> from sklearn.cluster import KMeans
>>> from sklearn.feature_extraction.text import TfidfVectorizer
>>> import random
>>> import numpy as np
>>> from transformers import RobertaTokenizer
>>> roberta_tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
>>> reddit = praw.Reddit(client_id='client id',
...                      client_secret='client secret',

## reddit_api_k-means.py
>>> import praw
>>> reddit = praw.Reddit(client_id='client id', #1
...                      client_secret='client secret',
...                      user_agent='user agent')

Version 7.1.0 of praw is outdated. Version 7.2.0 was released Wednesday February 24, 2021.
>>> def replies_of(top_level_comment, comment_list): #2
...     if len(top_level_comment.replies) == 0:
...         return
...     else:

## repeated_substring.py
>>> text_data_test = "Life is what life happens when you're busy life making other life plans."
>>> index_list = []
>>> flag = 0
>>> count = 0
>>> word_length = len('life')
>>> while 'life' in text_data_test:
...     return_index = text_data_test.lower().find('life')
...     print(return_index)
...     if return_index == -1:
...             break

## jaccord_substring.py
>>> from nltk.tokenize import word_tokenize
>>> text_data = "Life is what happens when you're busy making other plans."
>>> duplicate_data = "what happens when you're busy"
>>> original_tokens = word_tokenize(text_data)
>>> duplicate_tokens = word_tokenize(duplicate_data)
>>> # Convert all the characters to lower case because this method is case sensitive.
>>> original_tokens = [token.lower() for token in original_tokens]
>>> duplicate_tokens = [token.lower() for token in duplicate_tokens]
>>> original_trigrams = []
>>> for i in range(len(original_tokens) - 2):

## gensim_keyword_extraction.py
>>> from gensim.summarization import keywords
>>> text = """spaCy is an open-source software library for advanced natural language processing,
written in the programming languages Python and Cython. The library is published under the MIT license
and its main developers are Matthew Honnibal and Ines Montani, the founders of the software company Explosion."""
>>> print(keywords(text))
language
languages
software
company
	>>> import spacy
	>>> nlp = spacy.load("en_core_sci_lg")
	>>> text = """spaCy is an open-source software library for advanced natural language processing,
	written in the programming languages Python and Cython. The library is published under the MIT license
	and its main developers are Matthew Honnibal and Ines Montani, the founders of the software company Explosion."""
	>>> doc = nlp(text)
	>>> print(doc.ents)
	(spaCy, open-source software library, written, programming languages,
	Python, Cython, library, MIT, license, developers, Matthew Honnibal,
	Ines, Montani, founders, software company)
	import pandas as pd
	# Recommended tensorflow version is <= 2.1.0, otherwise F1 score function breaks
	import tensorflow as tf
	from sklearn.metrics import f1_score
	from sklearn.model_selection import train_test_split
	import tensorflow_datasets as tfds
	from transformers import TFRobertaForSequenceClassification
	from transformers import RobertaTokenizer

	# Load your Dataset
	from pydub import AudioSegment


	mp3_audio = AudioSegment.from_file(r"audio_full.wav", format="wav")
	print(len(mp3_audio)/(1000*60))
	# The 12-minute audio is broken into four 3-minute audio files (slicing is done in milliseconds)

	counter_audio = 180
	split_audio = [mp3_audio[:180*1000]]
	for i in range(4):
	# A python package for music and audio analysis.
	# https://librosa.org/doc/latest/index.html
	import librosa
	import torch
	from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer

	# load model and tokenizer
	tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
	model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
	# PRAW to interact with reddit
	import praw
	# Install textblob, if not already installed, using "pip install -U textblob"
	from textblob import TextBlob
	import nltk
	# Download VADER, if not downloaded
	# nltk.download('vader_lexicon')
	from nltk.sentiment.vader import SentimentIntensityAnalyzer

	# create object for VADER sentiment function interaction
	>>> import praw
	>>> import pandas as pd
	>>> from sklearn.cluster import KMeans
	>>> from sklearn.feature_extraction.text import TfidfVectorizer
	>>> import random
	>>> import numpy as np
	>>> from transformers import RobertaTokenizer
	>>> roberta_tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
	>>> reddit = praw.Reddit(client_id='client id',
	... client_secret='client secret',
	>>> text_data_test = "Life is what life happens when you're busy life making other life plans."
	>>> index_list = []
	>>> flag = 0
	>>> count = 0
	>>> word_length = len('life')
	>>> while 'life' in text_data_test:
	... return_index = text_data_test.lower().find('life')
	... print(return_index)
	... if return_index == -1:
	... break
	>>> from nltk.tokenize import word_tokenize
	>>> text_data = "Life is what happens when you're busy making other plans."
	>>> duplicate_data = "what happens when you're busy"
	>>> original_tokens = word_tokenize(text_data)
	>>> duplicate_tokens = word_tokenize(duplicate_data)
	>>> # Convert all the characters to lower case because this method is case sensitive.
	>>> original_tokens = [token.lower() for token in original_tokens]
	>>> duplicate_tokens = [token.lower() for token in duplicate_tokens]
	>>> original_trigrams = []
	>>> for i in range(len(original_tokens) - 2):
	>>> from gensim.summarization import keywords
	>>> text = """spaCy is an open-source software library for advanced natural language processing,
	written in the programming languages Python and Cython. The library is published under the MIT license
	and its main developers are Matthew Honnibal and Ines Montani, the founders of the software company Explosion."""
	>>> print(keywords(text))
	language
	languages
	software
	company