This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Python3 code to demonstrate | |
# convert dictionary string to dictionary | |
# using json.loads() | |
import json | |
# initializing string | |
string = '{"Kiprono" : 67, "Bob" : 76, "Alice" : 88}' | |
# printing original string | |
print(string) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Python3 code to demonstrate | |
# convert dictionary string to dictionary | |
# using the ast module | |
import ast | |
# initializing string | |
string = '{"Kiprono" : 67, "Bob" : 76, "Alice" : 88}' | |
# printing original string | |
print(string) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import numpy as np | |
# generate 15000 random integer data points, from -1000 (inclusive) to 1000 (exclusive) | |
col1= np.random.randint(-1000,1000,15000) | |
#dictionary of data | |
data = { | |
"col1" : col1, | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Q1 : Determine the average age per occupation. | |
df.groupby('occupation').age.mean() | |
#-----------------OUTPUT------------------------ | |
# occupation | |
# other 43 | |
# technician 24 | |
# writer 23 | |
# Name: age, dtype: int64 | |
#-----------------END OF OUTPUT----------------- |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Import necessary packages | |
from bs4 import BeautifulSoup | |
import requests | |
import pandas as pd | |
import re | |
# Site URL | |
url="https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)_per_capita" | |
# Make a GET request to fetch the raw HTML content | |
html_content = requests.get(url).text |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# On the site there are 3 tables with the class "wikitable" | |
# The following line will generate a list of HTML content for each table | |
gdp = soup.find_all("table", attrs={"class": "wikitable"}) | |
print("Number of tables on site: ",len(gdp)) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Let's go ahead and scrape the first table, whose HTML code is gdp[0] | |
table1 = gdp[0] | |
# the head will form our column names | |
body = table1.find_all("tr") | |
# Head values (Column names) are the first items of the body list | |
head = body[0] # 0th item is the header row | |
body_rows = body[1:] # All other items becomes the rest of the rows | |
# Let's now iterate through the header row's HTML code and make a list of clean headings |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Next, loop through the rest of the rows | |
#print(body_rows[0]) | |
all_rows = [] # will be a list of lists, one inner list per row | |
for row_num in range(len(body_rows)): # A row at a time | |
row = [] # this will hold the entries for one row | |
for row_item in body_rows[row_num].find_all("td"): #loop through all row entries | |
# row_item.text removes the tags from the entries | |
# the following regex is to remove \xa0 and \n and comma from row_item.text | |
# \xa0 is a non-breaking space, \n is the newline, and the comma separates thousands in numbers |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# We can now use the data in all_rows and headings to make a table | |
# all_rows becomes our data and headings the column names | |
df = pd.DataFrame(data=all_rows,columns=headings) | |
df.head() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# import library to use to vectorize | |
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer | |
# Vectorization on sklearn - simple example | |
corpus = [ | |
"Excellent Services by the ABC remit team.Recommend.", | |
"Bad Services. Transaction delayed for three days.Don't recommend."] | |
vectorizer = CountVectorizer() | |
X = vectorizer.fit_transform(corpus) | |
#print(X) # this is a sparse matrix: each entry prints as a (document index, token index) tuple followed by that token's count |
OlderNewer