This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Import needed libraries
import spacy
from sklearn import svm

# Load the "en_core_web_md" spaCy pipeline.
# The model package has to be installed beforehand (for example with
# `python -m spacy download en_core_web_md`), otherwise the load fails.
nlp = spacy.load("en_core_web_md")
# Build a class for categories
class Category:
    """Namespace of string constants naming the text categories."""

    BOOKS = "BOOKS"
    CANDY = "CANDY"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Import needed libraries
import re

# Original text
text = "The sidebar includes a Cheatsheet, full Reference, and Help. You can also save & Share with the Community, and view patterns you create or favorite in My Patterns."

# Compile the pattern once: an uppercase ASCII letter followed by one or more
# word characters.  Because of the `+`, one-letter words such as "A" or "I"
# are not matched.
pattern = re.compile(r'[A-Z]\w+')

# Scan the text and collect every match, in order of appearance
results = pattern.findall(text)

# Print the results list
print(results)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Import needed libraries
import re

# The text we want to search (the numbers are written with en dashes)
text = "Yesterday at the office party, I met the manager of the east coast branch, her phone number is 202–555–0180. I also exchange my number 202–555–0195 with a recruiter."

# Compile the pattern: 3 digits, 3 digits, 4 digits.
# The separator class accepts both the ASCII hyphen and the en dash, so the
# pattern works on plainly typed numbers as well as on typeset text like the
# sample above.
pattern = re.compile(r'\d{3}[-–]\d{3}[-–]\d{4}')

# Find all matches
results = pattern.findall(text)

# Print matches list
print(results)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Import needed library
import re

# The original text
text = "Python 3.0 was released on 03–12–2008. It was a major revision of the language that is not completely backward-compatible. Many of its major features were backported to Python 2.6. and 2.7 versions that were released on 03.10.2008 and 03/07/2010 respectively."

# Compile pattern: DD <sep> MM <sep> YYYY, with five capturing groups
#   1: day   01-31
#   2: separator ('.', '/', en dash or ASCII hyphen)
#   3: month 01-12
#   4: separator
#   5: year  1900-2099
# The century alternation is wrapped in a non-capturing group so that the two
# trailing digits apply to both "19" and "20"; written as (19|20\d\d) the
# pattern would match only the bare "19" of a 19xx year.
pattern = re.compile(
    r'(0[1-9]|[12][0-9]|3[01])([.–/-])(0[1-9]|1[012])([.–/-])((?:19|20)\d\d)'
)

# Find all matches; each match is a tuple holding the five groups
results = pattern.findall(text)

# Combine the groups of every match back into the date string
dates = [''.join(parts) for parts in results]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Import needed libraries
import pandas as pd

# Read data and save it in a dataframe
Df = pd.read_csv("Rdatasets/csv/datasets/Titanic.csv")

# Display description of data.
# The summary is printed explicitly: as a bare expression statement the
# result of describe() would be computed and then thrown away.
print(Df.describe())

# Inspect the first 5 rows of the dataframe
first = Df.head(5)

# Inspect the last 5 rows of the dataframe
last = Df.tail(5)
#Take a sample of 5 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Import needed libraries
from scipy.stats import chi2_contingency

# chi2_contingency tests whether the variables of a contingency table are
# independent.  It returns, in this order: the test statistic, the p value,
# the degrees of freedom and the table of expected frequencies.

# The Chi-Squared Test on a 2x3 table of observed counts
table = [[10, 20, 30], [6, 9, 17]]
stat, p, dof, expected = chi2_contingency(table)
print('The p value is %.3f' % p)

# Validity note: the test should be used only if the observed and expected
# frequencies in each cell are at least 5.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Import needed libraries
from scipy.stats import ttest_ind

# ttest_ind calculates the T-test for the means of TWO INDEPENDENT sets of
# values and returns the test statistic together with the two-sided p value.
data1 = [1.94, 2.29, -0.04, 0.84, -0.28, 1.34, 0.35, 0.12, -0.63, 0.11]
data2 = [1.142, -0.432, -0.938, -0.729, -0.846, -0.157, 0.500, 1.183, -1.075, -0.169]
stat, p = ttest_ind(data1, data2)
print('The p value is %.3f' % p)

# Validity note: the two samples must be independent of each other, and with
# the default arguments used here ttest_ind assumes that both populations
# have equal variances.
if p > 0.05: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Import needed libraries
from scipy.stats import f_oneway

# Perform a one-way ANOVA: f_oneway takes one sample per group and returns
# the F statistic together with the p value.

# Data of the three groups to compare
data1 = [0.873, 2.817, 0.121, -0.945, -0.055, -1.436, 0.360, -1.478, -1.637, -1.869]
data2 = [1.142, -0.432, -0.938, -0.729, -0.846, -0.157, 0.500, 1.183, -1.075, -0.169]
data3 = [-0.208, 0.696, 0.928, -1.148, -0.213, 0.229, 0.137, 0.269, -0.870, -1.204]
stat, p = f_oneway(data1, data2, data3)
print('The p value is %.3f' % p)
if p > 0.05: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Import needed libraries | |
import numpy as np | |
import matplotlib.pyplot as plt | |
# Observed values
x = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
y = np.array([1, 3, 2, 5, 7, 8, 8, 9, 10, 12])

# Number of points
n = x.size

# Mean of the x and y vectors
m_x, m_y = x.mean(), y.mean()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Import needed libraries
import pygal
import pandas as pd

# Parse the dataframe (downloads the CSV, so network access is required)
data = pd.read_csv("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv")

# Get the mean of the 'cases' column for every value of 'state'
mean_per_state = data.groupby('state')['cases'].mean()

# Draw the bar chart: one series per state.
# A plain loop is used because add() is called only for its side effect;
# a list comprehension would build a list that is never used.
barChart = pygal.Bar(height=400)
for state, mean_cases in mean_per_state.items():
    barChart.add(state, mean_cases)