Sara Metwalli SaraM92

## NLP.py
#Import needed library
import spacy
from sklearn import svm
#Load the spaCy pipeline named "en_core_web_md" once, at module import time,
#and keep the loaded object in `nlp`
nlp = spacy.load("en_core_web_md")

#Build a class for categories
class Category:
    """String constants naming the available categories.

    Each attribute is a plain ``str`` whose value is identical to its name.
    """
    BOOKS = "BOOKS"  # label for the books category
    CANDY = "CANDY"  # label for the candy category

## Regex_match_word.py
#Import needed libraries
import re
#Sample sentence that will be scanned
text = "The sidebar includes a Cheatsheet, full Reference, and Help. You can also save & Share with the Community, and view patterns you create or favorite in My Patterns."
#Pattern: one uppercase ASCII letter followed by one or more word characters
pattern = re.compile(r'[A-Z]\w+')
#Collect every non-overlapping match, in left-to-right order
results = pattern.findall(text)
#Show the list of matches
print(results)

## Regex_match_phoneno.py
#Import needed libraries
import re
#The text we want to search (its phone numbers are written with en dashes, U+2013)
text = "Yesterday at the office party, I met the manager of the east coast branch, her phone number is 202–555–0180. I also exchange my number 202–555–0195 with a recruiter."
#Compile the pattern: 3 digits, separator, 3 digits, separator, 4 digits.
#The separator class accepts the en dash used in the text above as well as the
#plain ASCII hyphen, so a number typed as 202-555-0180 is found too.
pattern = re.compile(r'\d{3}[–-]\d{3}[–-]\d{4}')
#Find all matches
results = pattern.findall(text)
#Print matches list
print(results)

## Regex_match_date.py
#Import needed library
import re
#The original text
text = "Python 3.0 was released on 03–12–2008. It was a major revision of the language that is not completely backward-compatible. Many of its major features were backported to Python 2.6. and 2.7 versions that were released on 03.10.2008 and 03/07/2010 respectively."
#Compile pattern: DD <sep> MM <sep> YYYY, with the year limited to 1900-2099.
#The century alternation sits in a non-capturing group so that \d\d applies to
#both centuries; written as (19|20\d\d) a 19xx year would match only "19".
#Separators: dot, slash, en dash (as used in the text) or ASCII hyphen.
#Exactly five capture groups are kept so that joining them rebuilds the date.
pattern = re.compile(r'(0[1-9]|[12][0-9]|3[01])([./–-])(0[1-9]|1[012])([./–-])((?:19|20)\d\d)')
#Find all matches; each match is a 5-tuple (day, sep, month, sep, year)
results = pattern.findall(text)
#Combine the groups of every match back into one date string
dates = [''.join(parts) for parts in results]

## EDA.py
#Import needed libraries
import pandas as pd
#Load the Titanic CSV (path relative to the working directory) into a dataframe
Df = pd.read_csv("Rdatasets/csv/datasets/Titanic.csv")
#Compute the summary statistics; the result is not stored, so it is only
#visible when this line is run in an interactive session
Df.describe()
#Keep the first 5 and the last 5 rows of the dataframe for inspection
first, last = Df.head(5), Df.tail(5)
#Take a sample of 5

## chiSquareTest.py
#Import needed libraries
from scipy.stats import chi2_contingency
#chi2_contingency runs the chi-squared test of independence on a contingency table.
#Its result is unpacked below, in this order: the test statistic, the p value,
#the degrees of freedom, and the table of expected frequencies.
#Observed counts: 2 rows by 3 columns
table = [
    [10, 20, 30],
    [6, 9, 17],
]
stat, p, dof, expected = chi2_contingency(table)
print('The p value is %.3f' % p)
#Validity note: the test should be used only if the observed and expected
#frequencies in each cell are at least 5

## tTest.py
#Import needed libraries
from scipy.stats import ttest_ind
#ttest_ind runs the t-test on the means of TWO INDEPENDENT samples.
#First sample
data1 = [1.94, 2.29, -0.04, 0.84, -0.28, 1.34, 0.35, 0.12, -0.63, 0.11]
#Second sample
data2 = [1.142, -0.432, -0.938, -0.729, -0.846, -0.157, 0.500, 1.183, -1.075, -0.169]
#Unpack the result into the test statistic and the p value
stat, p = ttest_ind(data1, data2)
print('The p value is %.3f' % p)
#Validity note: called with its defaults, as above, the test assumes the two
#samples are independent and have equal variances (see SciPy's `equal_var`)
if p > 0.05:

## anova.py
#Import needed libraries
from scipy.stats import f_oneway
#One-way ANOVA across three samples
#Sample 1
data1 = [0.873, 2.817, 0.121, -0.945, -0.055, -1.436, 0.360, -1.478, -1.637, -1.869]
#Sample 2
data2 = [1.142, -0.432, -0.938, -0.729, -0.846, -0.157, 0.500, 1.183, -1.075, -0.169]
#Sample 3
data3 = [-0.208, 0.696, 0.928, -1.148, -0.213, 0.229, 0.137, 0.269, -0.870, -1.204]
#Unpack the result into the F statistic and the p value
stat, p = f_oneway(data1, data2, data3)
print('The p value is  %.3f' % p)
if p > 0.05:

## LinearReg.py
#Import needed libraries
import numpy as np
import matplotlib.pyplot as plt
#Observed values (x: predictor, y: response)
x = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
y = np.array([1, 3, 2, 5, 7, 8, 8, 9, 10, 12])
# number of observations
n = x.size
# sample mean of each vector
m_x = x.mean()
m_y = y.mean()

## pygal_barChart.py
#Import needed libraries
import pygal
import pandas as pd
#Parse the dataframe (downloads the us-counties CSV over the network)
data = pd.read_csv("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv")
#Get the mean number of cases per state
mean_per_state = data.groupby('state')['cases'].mean()
#Draw the bar chart
barChart = pygal.Bar(height=400)
#Add one series per state. A plain loop is used because add() is called only
#for its side effect; a list comprehension would build a list that is thrown away.
for state, mean_cases in mean_per_state.items():
    barChart.add(state, mean_cases)
	#Import needed library
	import spacy
	from sklearn import svm
	#Load the spaCy pipeline named "en_core_web_md" and keep the loaded object in `nlp`
	nlp = spacy.load("en_core_web_md")

	#Build a class for categories
	class Category:
		"""String constants naming the available categories.

		Each attribute is a plain ``str`` whose value is identical to its name.
		"""
		# The attributes must be indented under the class header to form its body.
		BOOKS = "BOOKS"
		CANDY = "CANDY"
	#Import needed libraries
	import re
	#Sample sentence that will be scanned
	text = "The sidebar includes a Cheatsheet, full Reference, and Help. You can also save & Share with the Community, and view patterns you create or favorite in My Patterns."
	#Pattern: one uppercase ASCII letter followed by one or more word characters
	pattern = re.compile(r'[A-Z]\w+')
	#Collect every non-overlapping match, in left-to-right order
	results = pattern.findall(text)
	#Show the list of matches
	print(results)
	#Import needed libraries
	import re
	#The text we want to search (its phone numbers are written with en dashes, U+2013)
	text = "Yesterday at the office party, I met the manager of the east coast branch, her phone number is 202–555–0180. I also exchange my number 202–555–0195 with a recruiter."
	#Compile the pattern: 3 digits, separator, 3 digits, separator, 4 digits.
	#The separator class accepts the en dash used in the text above as well as the
	#plain ASCII hyphen, so a number typed as 202-555-0180 is found too.
	pattern = re.compile(r'\d{3}[–-]\d{3}[–-]\d{4}')
	#Find all matches
	results = pattern.findall(text)
	#Print matches list
	print(results)
	#Import needed library
	import re
	#The original text
	text = "Python 3.0 was released on 03–12–2008. It was a major revision of the language that is not completely backward-compatible. Many of its major features were backported to Python 2.6. and 2.7 versions that were released on 03.10.2008 and 03/07/2010 respectively."
	#Compile pattern: DD <sep> MM <sep> YYYY, with the year limited to 1900-2099.
	#Alternation bars are left unescaped: an escaped \| matches a literal pipe
	#character, which makes the pattern match nothing in ordinary text.
	#The century alternation sits in a non-capturing group so that \d\d applies to
	#both centuries; written as (19|20\d\d) a 19xx year would match only "19".
	#Separators: dot, slash, en dash (as used in the text) or ASCII hyphen.
	#Exactly five capture groups are kept so that joining them rebuilds the date.
	pattern = re.compile(r'(0[1-9]|[12][0-9]|3[01])([./–-])(0[1-9]|1[012])([./–-])((?:19|20)\d\d)')
	#Find all matches; each match is a 5-tuple (day, sep, month, sep, year)
	results = pattern.findall(text)
	#Combine the groups of every match back into one date string
	dates = [''.join(parts) for parts in results]
	#Import needed libraries
	import pandas as pd
	#Load the Titanic CSV (path relative to the working directory) into a dataframe
	Df = pd.read_csv("Rdatasets/csv/datasets/Titanic.csv")
	#Compute the summary statistics; the result is not stored, so it is only
	#visible when this line is run in an interactive session
	Df.describe()
	#Keep the first 5 and the last 5 rows of the dataframe for inspection
	first, last = Df.head(5), Df.tail(5)
	#Take a sample of 5
	#Import needed libraries
	from scipy.stats import chi2_contingency
	#chi2_contingency runs the chi-squared test of independence on a contingency table.
	#Its result is unpacked below, in this order: the test statistic, the p value,
	#the degrees of freedom, and the table of expected frequencies.
	#Observed counts: 2 rows by 3 columns
	table = [
		[10, 20, 30],
		[6, 9, 17],
	]
	stat, p, dof, expected = chi2_contingency(table)
	print('The p value is %.3f' % p)
	#Validity note: the test should be used only if the observed and expected
	#frequencies in each cell are at least 5
	#Import needed libraries
	from scipy.stats import ttest_ind
	#ttest_ind runs the t-test on the means of TWO INDEPENDENT samples.
	#First sample
	data1 = [1.94, 2.29, -0.04, 0.84, -0.28, 1.34, 0.35, 0.12, -0.63, 0.11]
	#Second sample
	data2 = [1.142, -0.432, -0.938, -0.729, -0.846, -0.157, 0.500, 1.183, -1.075, -0.169]
	#Unpack the result into the test statistic and the p value
	stat, p = ttest_ind(data1, data2)
	print('The p value is %.3f' % p)
	#Validity note: called with its defaults, as above, the test assumes the two
	#samples are independent and have equal variances (see SciPy's `equal_var`)
	if p > 0.05:
	#Import needed libraries
	from scipy.stats import f_oneway
	#One-way ANOVA across three samples
	#Sample 1
	data1 = [0.873, 2.817, 0.121, -0.945, -0.055, -1.436, 0.360, -1.478, -1.637, -1.869]
	#Sample 2
	data2 = [1.142, -0.432, -0.938, -0.729, -0.846, -0.157, 0.500, 1.183, -1.075, -0.169]
	#Sample 3
	data3 = [-0.208, 0.696, 0.928, -1.148, -0.213, 0.229, 0.137, 0.269, -0.870, -1.204]
	#Unpack the result into the F statistic and the p value
	stat, p = f_oneway(data1, data2, data3)
	print('The p value is %.3f' % p)
	if p > 0.05:
	#Import needed libraries
	import numpy as np
	import matplotlib.pyplot as plt
	#Observed values (x: predictor, y: response)
	x = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
	y = np.array([1, 3, 2, 5, 7, 8, 8, 9, 10, 12])
	# number of observations
	n = x.size
	# sample mean of each vector
	m_x = x.mean()
	m_y = y.mean()
	#Import needed libraries
	import pygal
	import pandas as pd
	#Parse the dataframe (downloads the us-counties CSV over the network)
	data = pd.read_csv("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv")
	#Get the mean number of cases per state
	mean_per_state = data.groupby('state')['cases'].mean()
	#Draw the bar chart
	barChart = pygal.Bar(height=400)
	#Add one series per state. A plain loop is used because add() is called only
	#for its side effect; a list comprehension would build a list that is thrown away.
	for state, mean_cases in mean_per_state.items():
		barChart.add(state, mean_cases)