# height and weight are available as regular lists
# Import numpy
import numpy as np
# Calculate the BMI: bmi
np_height_m = np.array(height_in) * 0.0254
np_weight_kg = np.array(weight_lb) * 0.453592
bmi = np_weight_kg / np_height_m ** 2
# Create the light array
light = bmi < 21
# Print out light
print(light)
# Print out BMIs of all baseball players whose BMI is below 21
print(bmi[light])
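# The same mask pattern extends to compound conditions; a minimal sketch with
# hypothetical BMI values (plain `and` does not broadcast over arrays, so
# np.logical_and is used):
bmi_sample = np.array([21.852, 20.975, 21.750, 24.747, 21.441])
print(bmi_sample[np.logical_and(bmi_sample > 21, bmi_sample < 22)])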
# numpy and matplotlib imported, seed set
# Simulate random walk 500 times
all_walks = []
for i in range(500):
    random_walk = [0]
    for x in range(100):
        step = random_walk[-1]
        dice = np.random.randint(1, 7)
        if dice <= 2:
            step = max(0, step - 1)
        elif dice <= 5:
            step = step + 1
        else:
            step = step + np.random.randint(1, 7)
        if np.random.rand() <= 0.001:
            step = 0
        random_walk.append(step)
    all_walks.append(random_walk)
# Create and plot np_aw_t
np_aw_t = np.transpose(np.array(all_walks))
# Select last row from np_aw_t: ends
ends = np_aw_t[-1]
# Plot histogram of ends, display plot
plt.hist(ends)
plt.show()
# calculate the chance that this end point is greater than or equal to 60
In [1]: len(ends[ends >= 60]) / 500
Out[1]: 0.784
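# An equivalent one-liner: a Boolean array's mean is the fraction of True entries
print(np.mean(ends >= 60))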
# Import pandas
import pandas as pd
# Import Twitter data as DataFrame: df
df = pd.read_csv("tweets.csv")
# Initialize an empty dictionary: langs_count
langs_count = {}
# Extract column from DataFrame: col
col = df['lang']
# Iterate over lang column in DataFrame
for entry in col:
    # If the language is in langs_count, add 1
    if entry in langs_count.keys():
        langs_count[entry] = langs_count[entry] + 1
    # Else add the language to langs_count, set the value to 1
    else:
        langs_count[entry] = 1
# Print the populated dictionary
print(langs_count)
{'et': 1, 'en': 97, 'und': 2}
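# This counting loop is common enough that the standard library covers it;
# a sketch using collections.Counter over the col Series extracted above:
from collections import Counter
print(dict(Counter(col)))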
# Define count_entries()
def count_entries(df, col_name):
    """Return a dictionary with counts of
    occurrences as value for each key."""
    # Initialize an empty dictionary: langs_count
    langs_count = {}
    # Extract column from DataFrame: col
    col = df[col_name]
    # Iterate over lang column in DataFrame
    for entry in col:
        # If the language is in langs_count, add 1
        if entry in langs_count.keys():
            langs_count[entry] = langs_count[entry] + 1
        # Else add the language to langs_count, set the value to 1
        else:
            langs_count[entry] = 1
    # Return the langs_count dictionary
    return langs_count
# Call count_entries(): result
result = count_entries(tweets_df, 'lang')
# Print the result
print(result)
{'et': 1, 'en': 97, 'und': 2}
# Define count_entries()
def count_entries(df, col_name='lang'):
    """Return a dictionary with counts of
    occurrences as value for each key."""
    # Initialize an empty dictionary: cols_count
    cols_count = {}
    # Extract column from DataFrame: col
    col = df[col_name]
    # Iterate over the column in DataFrame
    for entry in col:
        # If entry is in cols_count, add 1
        if entry in cols_count.keys():
            cols_count[entry] += 1
        # Else add the entry to cols_count, set the value to 1
        else:
            cols_count[entry] = 1
    # Return the cols_count dictionary
    return cols_count
# Call count_entries(): result1
result1 = count_entries(tweets_df, 'lang')
# Call count_entries(): result2
result2 = count_entries(tweets_df, 'source')
# Print result1 and result2
print(result1)
print(result2)
{'et': 1, 'en': 97, 'und': 2}
{'<a href="http://ru....
# Define count_entries()
def count_entries(df, *args):
    """Return a dictionary with counts of
    occurrences as value for each key."""
    # Initialize an empty dictionary: cols_count
    cols_count = {}
    # Iterate over column names in args
    for col_name in args:
        # Extract column from DataFrame: col
        col = df[col_name]
        # Iterate over the column in DataFrame
        for entry in col:
            # If entry is in cols_count, add 1
            if entry in cols_count.keys():
                cols_count[entry] += 1
            # Else add the entry to cols_count, set the value to 1
            else:
                cols_count[entry] = 1
    # Return the cols_count dictionary
    return cols_count
# Call count_entries(): result1
result1 = count_entries(tweets_df, 'lang')
# Call count_entries(): result2
result2 = count_entries(tweets_df, 'lang', 'source')
# Print result1 and result2
print(result1)
print(result2)
{'et': 1, 'en': 97, 'und': 2}
{'<a href="http://rutracker.o....
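# pandas can also produce these counts directly; a one-line equivalent of
# count_entries() for a single column (same assumed tweets_df):
print(tweets_df['lang'].value_counts().to_dict())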
# Select retweets from the Twitter DataFrame: result
result = filter(lambda x: x[0:2] == "RT", tweets_df['text'])
# Create list from filter object result: res_list
res_list = list(result)
# Print all retweets in res_list
for tweet in res_list:
    print(tweet)
RT @bpolitics: .@krollbondrating's Christopher Whalen says Clinton is the weakest Dem candidate in 50 years https://t.co/pLk7rvoRSn https:/…
RT @HeidiAlpine: @dmartosko Cruz video found.....racing from the scene.... #cruzsexscandal https://t.co/zuAPZfQDk3
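# The same selection works with pandas string methods instead of filter();
# a sketch over the same assumed tweets_df:
is_retweet = tweets_df['text'].str.startswith('RT')
for tweet in tweets_df.loc[is_retweet, 'text']:
    print(tweet)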
# Define count_entries()
def count_entries(df, col_name='lang'):
    """Return a dictionary with counts of
    occurrences as value for each key."""
    # Initialize an empty dictionary: cols_count
    cols_count = {}
    # Add try block
    try:
        # Extract column from DataFrame: col
        col = df[col_name]
        # Iterate over the column in DataFrame
        for entry in col:
            # If entry is in cols_count, add 1
            if entry in cols_count.keys():
                cols_count[entry] += 1
            # Else add the entry to cols_count, set the value to 1
            else:
                cols_count[entry] = 1
        # Return the cols_count dictionary
        return cols_count
    # Add except block
    except:
        print('The DataFrame does not have a ' + col_name + ' column.')
# Call count_entries(): result1
result1 = count_entries(tweets_df, 'lang')
# Print result1
print(result1)
{'et': 1, 'en': 97, 'und': 2}
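# A bare except also hides unrelated bugs; a sketch of the same guard that
# catches only the missing-column error (pandas raises KeyError for an
# unknown column label):
try:
    col = df[col_name]
except KeyError:
    print('The DataFrame does not have a ' + col_name + ' column.')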
# Define count_entries()
def count_entries(df, col_name='lang'):
    """Return a dictionary with counts of
    occurrences as value for each key."""
    # Raise a ValueError if col_name is NOT in DataFrame
    if col_name not in df.columns:
        raise ValueError('The DataFrame does not have a ' + col_name + ' column.')
    # Initialize an empty dictionary: cols_count
    cols_count = {}
    # Extract column from DataFrame: col
    col = df[col_name]
    # Iterate over the column in DataFrame
    for entry in col:
        # If entry is in cols_count, add 1
        if entry in cols_count.keys():
            cols_count[entry] += 1
        # Else add the entry to cols_count, set the value to 1
        else:
            cols_count[entry] = 1
    # Return the cols_count dictionary
    return cols_count
# Call count_entries(): result1
result1 = count_entries(tweets_df, 'lang')
# Print result1
print(result1)
{'et': 1, 'en': 97, 'und': 2}
# Define count_entries()
def count_entries(csv_file, c_size, colname):
    """Return a dictionary with counts of
    occurrences as value for each key."""
    # Initialize an empty dictionary: counts_dict
    counts_dict = {}
    # Iterate over the file chunk by chunk
    for chunk in pd.read_csv(csv_file, chunksize=c_size):
        # Iterate over the column in DataFrame
        for entry in chunk[colname]:
            if entry in counts_dict.keys():
                counts_dict[entry] += 1
            else:
                counts_dict[entry] = 1
    # Return counts_dict
    return counts_dict
# Call count_entries(): result_counts
result_counts = count_entries("tweets.csv", 10, "lang")
# Print result_counts
print(result_counts)
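# The chunked pattern combines naturally with value_counts(); a sketch that
# sums per-chunk counts with a Counter (same assumed tweets.csv):
from collections import Counter

counts = Counter()
for chunk in pd.read_csv("tweets.csv", chunksize=10):
    # Counter.update() adds the chunk's counts to the running totals
    counts.update(chunk["lang"].value_counts().to_dict())
print(dict(counts))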
# Create a list of tuples: mutant_data
mutant_data = list(zip(mutants, aliases, powers))
# Print the list of tuples
print(mutant_data)
# Create a zip object using the three lists: mutant_zip
mutant_zip = zip(mutants, aliases, powers)
# Print the zip object
print(mutant_zip)
# Unpack the zip object and print the tuple values
for value1, value2, value3 in list(mutant_zip):
    print(value1, value2, value3)
# Create a zip object from mutants and powers: z1
z1 = zip(mutants, powers)
# Print the tuples in z1 by unpacking with *
print(*z1)
# Re-create a zip object from mutants and powers: z1
z1 = zip(mutants, powers)
# 'Unzip' the tuples in z1 by unpacking with * and zip(): result1, result2
result1, result2 = zip(*z1)
# Check if unpacked tuples are equivalent to original tuples
print(result1 == mutants)
print(result2 == powers)
# Create an iterator for range(3): small_value
small_value = iter(range(3))
# Print the values in small_value
print(next(small_value))
print(next(small_value))
print(next(small_value))
# Loop over range(3) and print the values
for num in range(3):
    print(num)
# Create an iterator for range(10 ** 100): googol
googol = iter(range(10 ** 100))
# Print the first 5 values from googol
print(next(googol))
print(next(googol))
print(next(googol))
print(next(googol))
print(next(googol))
# Create a 5 x 5 matrix using a list of lists: matrix
matrix = [[col for col in range(5)] for row in range(5)]
# Print the matrix
for row in matrix:
    print(row)
[0, 1, 2, 3, 4]
[0, 1, 2, 3, 4]
[0, 1, 2, 3, 4]
[0, 1, 2, 3, 4]
[0, 1, 2, 3, 4]
# Create a list of strings: fellowship
fellowship = ['frodo', 'samwise', 'merry', 'aragorn', 'legolas', 'boromir', 'gimli']
# Create list comprehension: new_fellowship
new_fellowship = [member for member in fellowship if len(member) >= 7]
# Print the new list
print(new_fellowship)
['samwise', 'aragorn', 'legolas', 'boromir']
# Create a list of strings: fellowship
fellowship = ['frodo', 'samwise', 'merry', 'aragorn', 'legolas', 'boromir', 'gimli']
# Create list comprehension: new_fellowship
new_fellowship = [member if len(member) >= 7 else "" for member in fellowship]
# Print the new list
print(new_fellowship)
['', 'samwise', '', 'aragorn', 'legolas', 'boromir', '']
In [1]: fellowship = ['frodo', 'samwise', 'merry', 'aragorn', 'legolas', 'boromir', 'gimli']
# Create dict comprehension: new_fellowship
new_fellowship = {member: len(member) for member in fellowship}
# Print the new list
print(new_fellowship)
{'samwise': 7, 'boromir': 7, 'aragorn': 7, 'merry': 5, 'frodo': 5, 'gimli': 5, 'legolas': 7}
Recall that generator expressions have essentially the same syntax as list comprehensions, except that they use parentheses () instead of brackets []; this should make things feel familiar! Furthermore, if you have ever iterated over a dictionary with .items(), or used the range() function, you have already encountered and used generators without knowing it. When you use these functions, Python creates generators for you behind the scenes.
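# The practical payoff is memory: a generator produces values on demand instead
# of materializing them all; a minimal comparison using only the standard library:
import sys

as_list = [num for num in range(10000)]
as_gen = (num for num in range(10000))
# The list stores every element; the generator is a small fixed-size object
print(sys.getsizeof(as_list), sys.getsizeof(as_gen))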
In [4]: # Create generator object: result
result = (num for num in range(0,10))
# Print the first 5 values
print(next(result))
print(next(result))
print(next(result))
print(next(result))
print(next(result))
# Print the rest of the values
for value in result:
    print(value)
0
1
2
3
4
5
6
7
8
9
In [1]: # Create a list of strings
lannister = ['cersei', 'jaime', 'tywin', 'tyrion', 'joffrey']
# Define generator function get_lengths
def get_lengths(input_list):
    """Generator function that yields the
    length of the strings in input_list."""
    # Yield the length of a string
    for person in input_list:
        yield len(person)

# Print the values generated by get_lengths()
for value in get_lengths(lannister):
    print(value)
6
5
5
6
7
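# For a one-off like this, a generator expression does the same job without
# a named function:
for value in (len(person) for person in lannister):
    print(value)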
In [1]: # Define lists2dict()
def lists2dict(list1, list2):
    """Return a dictionary where list1 provides
    the keys and list2 provides the values."""
    # Zip lists: zipped_lists
    zipped_lists = zip(list1, list2)
    # Create a dictionary: rs_dict
    rs_dict = dict(zipped_lists)
    # Return the dictionary
    return rs_dict
# Call lists2dict: rs_fxn
rs_fxn = lists2dict(feature_names, row_vals)
# Print rs_fxn
print(rs_fxn)
{'Value': '133.56090740552298', 'IndicatorCode': 'SP.ADO.TFRT', 'CountryName': 'Arab World', 'CountryCode': 'ARB', 'Year': '1960', 'IndicatorName': 'Adolescent fertility rate (births per 1,000 women ages 15-19)'}
In [1]: # Print the first two lists in row_lists
print(row_lists[0])
print(row_lists[1])
# Turn list of lists into list of dicts: list_of_dicts
list_of_dicts = [lists2dict(feature_names, sublist) for sublist in row_lists]
# Print the first two dictionaries in list_of_dicts
print(list_of_dicts[0])
print(list_of_dicts[1])
['Arab World', 'ARB', 'Adolescent fertility rate (births per 1,000 women ages 15-19)', 'SP.ADO.TFRT', '1960', '133.56090740552298']
['Arab World', 'ARB', 'Age dependency ratio (% of working-age population)', 'SP.POP.DPND', '1960', '87.7976011532547']
{'Value': '133.56090740552298', 'IndicatorCode': 'SP.ADO.TFRT', 'CountryName': 'Arab World', 'CountryCode': 'ARB', 'Year': '1960', 'IndicatorName': 'Adolescent fertility rate (births per 1,000 women ages 15-19)'}
{'Value': '87.7976011532547', 'IndicatorCode': 'SP.POP.DPND', 'CountryName': 'Arab World', 'CountryCode': 'ARB', 'Year': '1960', 'IndicatorName': 'Age dependency ratio (% of working-age population)'}
In [1]: # Import the pandas package
import pandas as pd
# Turn list of lists into list of dicts: list_of_dicts
list_of_dicts = [lists2dict(feature_names, sublist) for sublist in row_lists]
# Turn list of dicts into a DataFrame: df
df = pd.DataFrame(list_of_dicts)
# Print the head of the DataFrame
print(df.head())
CountryCode CountryName IndicatorCode \
0 ARB Arab World SP.ADO.TFRT
1 ARB Arab World SP.POP.DPND
2 ARB Arab World SP.POP.DPND.OL
3 ARB Arab World SP.POP.DPND.YG
4 ARB Arab World MS.MIL.XPRT.KD
IndicatorName Value Year
0 Adolescent fertility rate (births per 1,000 wo... 133.56090740552298 1960
1 Age dependency ratio (% of working-age populat... 87.7976011532547 1960
2 Age dependency ratio, old (% of working-age po... 6.634579191565161 1960
3 Age dependency ratio, young (% of working-age ... 81.02332950839141 1960
4 Arms exports (SIPRI trend indicator values) 3000000.0 1960
In [1]: # Open a connection to the file
with open('world_dev_ind.csv') as file:
    # Skip the column names
    file.readline()
    # Initialize an empty dictionary: counts_dict
    counts_dict = {}
    # Process only the first 1000 rows
    for j in range(0, 1000):
        # Split the current line into a list: line
        line = file.readline().split(',')
        # Get the value for the first column: first_col
        first_col = line[0]
        # If the column value is in the dict, increment its value
        if first_col in counts_dict.keys():
            counts_dict[first_col] += 1
        # Else, add to the dict and set value to 1
        else:
            counts_dict[first_col] = 1
# Print the resulting dictionary
print(counts_dict)
{'Arab World': 80, 'Heavily indebted poor countries (HIPC)': 18, 'European Union': 116, 'East Asia & Pacific (developing only)': 123, 'Fragile and conflict affected situations': 76, 'Euro area': 119, 'Central Europe and the Baltics': 71, 'Europe & Central Asia (developing only)': 89, 'East Asia & Pacific (all income levels)': 122, 'Europe & Central Asia (all income levels)': 109, 'Caribbean small states': 77}
In [1]: # Define read_large_file()
def read_large_file(file_object):
    """A generator function to read a large file lazily."""
    # Loop indefinitely until the end of the file
    while True:
        # Read a line from the file: data
        data = file_object.readline()
        # Break if this is the end of the file
        if not data:
            break
        # Yield the line of data
        yield data

# Open a connection to the file
with open('world_dev_ind.csv') as file:
    # Create a generator object for the file: gen_file
    gen_file = read_large_file(file)
    # Print the first three lines of the file
    print(next(gen_file))
    print(next(gen_file))
    print(next(gen_file))
CountryName,CountryCode,IndicatorName,IndicatorCode,Year,Value
Arab World,ARB,"Adolescent fertility rate (births per 1,000 women ages 15-19)",SP.ADO.TFRT,1960,133.56090740552298
Arab World,ARB,Age dependency ratio (% of working-age population),SP.POP.DPND,1960,87.7976011532547
In [1]: # Initialize an empty dictionary: counts_dict
counts_dict = {}
# Open a connection to the file
with open("world_dev_ind.csv") as file:
# Iterate over the generator from read_large_file()
for line in read_large_file(file):
row = line.split(',')
first_col = row[0]
if first_col in counts_dict.keys():
counts_dict[first_col] += 1
else:
counts_dict[first_col] = 1
# Print
print(counts_dict)
{'North America': 123, 'Least developed countries: UN classification': 78, 'CountryName': 1, 'Euro area': 119, 'East Asia & Pacific (all income levels)': 122, 'High income': 131, 'Middle income': 138, 'Arab World': 80, 'Low income': 80, 'High income: nonOECD': 68, 'Central Europe and the Baltics': 71, 'Other small states': 63, 'Europe & Central Asia (all income levels)': 109, 'European Union': 116, 'Small states': 69, 'Fragile and conflict affected situations': 76, 'Lower middle income': 126, 'Heavily indebted poor countries (HIPC)': 99, 'Latin America & Caribbean (all income levels)': 130, 'Low & middle income': 138, 'High income: OECD': 127, 'Middle East & North Africa (developing only)': 94, 'South Asia': 36, 'East Asia & Pacific (developing only)': 123, 'Latin America & Caribbean (developing only)': 133, 'Pacific island small states': 66, 'Middle East & North Africa (all income levels)': 89, 'Europe & Central Asia (developing only)': 89, 'OECD members': 130, 'Caribbean small states': 77}
# Define plot_pop()
def plot_pop(filename, country_code):
    # Initialize reader object: urb_pop_reader
    urb_pop_reader = pd.read_csv(filename, chunksize=1000)
    # Initialize empty DataFrame: data
    data = pd.DataFrame()
    # Iterate over each DataFrame chunk
    for df_urb_pop in urb_pop_reader:
        # Check out specific country: df_pop_ceb
        df_pop_ceb = df_urb_pop[df_urb_pop['CountryCode'] == country_code]
        # Zip DataFrame columns of interest: pops
        pops = zip(df_pop_ceb['Total Population'],
                   df_pop_ceb['Urban population (% of total)'])
        # Turn zip object into list: pops_list
        pops_list = list(pops)
        # Use list comprehension to create new DataFrame column 'Total Urban Population'
        df_pop_ceb['Total Urban Population'] = [int(tup[0] * tup[1] * 0.01) for tup in pops_list]
        # Append DataFrame chunk to data: data
        data = data.append(df_pop_ceb)
    # Plot urban population data
    data.plot(kind='scatter', x='Year', y='Total Urban Population')
    plt.show()
# Set the filename: fn
fn = 'ind_pop_data.csv'
# Call plot_pop for country code 'CEB'
plot_pop(fn, "CEB")
# Call plot_pop for country code 'ARB'
plot_pop(fn, "ARB")
# Import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
# Assign filename: file
file = 'titanic_corrupt.txt'
# Import file: data
data = pd.read_csv(file, sep="\t", comment='#', na_values='Nothing')
# Print the head of the DataFrame
print(data.head())
# Plot 'Age' variable in a histogram
pd.DataFrame.hist(data[['Age']])
plt.xlabel('Age (years)')
plt.ylabel('count')
plt.show()
# Import packages
from sqlalchemy import create_engine
import pandas as pd
# Create engine: engine
engine = create_engine('sqlite:///Chinook.sqlite')
# Open engine connection: con
con = engine.connect()
# Perform query: rs
rs = con.execute("select * from album")
# Save results of the query to DataFrame: df
df = pd.DataFrame(rs.fetchall())
# Close connection
con.close()
import pandas as pd
from sqlalchemy import create_engine
# Open engine in context manager
# Perform query and save results to DataFrame: df
with engine.connect() as con:
    rs = con.execute("select LastName, title from employee")
    df = pd.DataFrame(rs.fetchmany(size=3))
    df.columns = rs.keys()
# Print the length of the DataFrame df
print(len(df))
# Print the head of the DataFrame df
print(df.head())
# Import packages
from sqlalchemy import create_engine
import pandas as pd
# Create engine: engine
engine = create_engine("sqlite:///Chinook.sqlite")
# Execute query and store records in DataFrame: df
df = pd.read_sql_query("select * from album", engine)
# Print head of DataFrame
print(df.head())
# Open engine in context manager and store query result in df1
with engine.connect() as con:
    rs = con.execute("SELECT * FROM Album")
    df1 = pd.DataFrame(rs.fetchall())
    df1.columns = rs.keys()
# Confirm that both methods yield the same result
print(df.equals(df1))
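# On SQLAlchemy 1.4+/2.0, a raw SQL string must be wrapped in text() before
# execution; a sketch of the context-manager query under that API (assuming
# pandas imported as pd):
from sqlalchemy import create_engine, text

engine = create_engine("sqlite:///Chinook.sqlite")
with engine.connect() as con:
    rs = con.execute(text("SELECT * FROM Album"))
    df1 = pd.DataFrame(rs.fetchall())
    df1.columns = rs.keys()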
# Import packages
import matplotlib.pyplot as plt
import pandas as pd
# Assign url of file: url
url = "https://s3.amazonaws.com/assets.datacamp.com/production/course_1606/datasets/winequality-red.csv"
# Read file into a DataFrame: df
df = pd.read_csv(url, sep=";")
# Print the head of the DataFrame
print(df.head())
# Plot first column of df
pd.DataFrame.hist(df.iloc[:, 0:1])
plt.xlabel('fixed acidity (g(tartaric acid)/dm$^3$)')
plt.ylabel('count')
plt.show()
# Import package
import pandas as pd
# Assign url of file: url
url = "http://s3.amazonaws.com/assets.datacamp.com/course/importing_data_into_r/latitude.xls"
# Read in all sheets of Excel file: xl
xl = pd.read_excel(url, sheet_name=None)
# Print the sheetnames to the shell
print(xl.keys())
# Print the head of the first sheet (using its name, NOT its index)
print(xl['1700'].head())
# Import packages
import requests
from bs4 import BeautifulSoup
# Specify url: url
url = 'https://www.python.org/~guido/'
# Package the request, send the request and catch the response: r
r = requests.get(url)
# Extract the response as html: html_doc
html_doc = r.text
# Create a BeautifulSoup object from the HTML: soup
soup = BeautifulSoup(html_doc, 'html.parser')
# Get the title of Guido's webpage: guido_title
guido_title = soup.title
# Print the title of Guido's webpage to the shell
print(guido_title)
# Get Guido's text: guido_text
guido_text = soup.get_text()
# Print Guido's text to the shell
print(guido_text)
# Import package
import tweepy
# Store OAuth authentication credentials in relevant variables
access_token = "1092294848-aHN7DcRP9B4VMTQIhwqOYiB14YkW92fFO8k8EPy"
access_token_secret = "X4dHmhPfaksHcQ7SCbmZa2oYBBVSD2g8uIHXsp5CTaksx"
consumer_key = "nZ6EA0FxZ293SxGNg8g8aP0HM"
consumer_secret = "fJGEodwe3KiKUnsYJC3VRndj7jevVvXbK2D5EiJ2nehafRgA6i"
# Pass OAuth details to tweepy's OAuth handler
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
# Initialize Stream listener
l = MyStreamListener()
# Create your Stream object with authentication
stream = tweepy.Stream(auth, l)
# Filter Twitter Streams to capture data by the keywords:
stream.filter(track=['clinton', 'trump','sanders','cruz'])
# Import package
import json
# String of path to file: tweets_data_path
tweets_data_path = "tweets.txt"
# Initialize empty list to store tweets: tweets_data
tweets_data = []
# Open connection to file
tweets_file = open(tweets_data_path, "r")
# Read in tweets and store in list: tweets_data
for line in tweets_file:
    tweet = json.loads(line)
    tweets_data.append(tweet)
# Close connection to file
tweets_file.close()
# Print the keys of the first tweet dict
print(tweets_data[0].keys())
# Import package
import pandas as pd
# Build DataFrame of tweet texts and languages
df = pd.DataFrame(tweets_data, columns=['text','lang'])
# Print head of DataFrame
print(df.head())
# Initialize list to store tweet counts
[clinton, trump, sanders, cruz] = [0, 0, 0, 0]
# Iterate through df, counting the number of tweets in which
# each candidate is mentioned
for index, row in df.iterrows():
    clinton += word_in_text('clinton', row['text'])
    trump += word_in_text('trump', row['text'])
    sanders += word_in_text('sanders', row['text'])
    cruz += word_in_text('cruz', row['text'])
import seaborn as sns
import matplotlib.pyplot as plt
# Set seaborn style
sns.set(color_codes=True)
# Create a list of labels: cd
cd = ['clinton', 'trump', 'sanders', 'cruz']
# Plot histogram
ax = sns.barplot(x=cd, y=[clinton, trump, sanders, cruz])
ax.set(ylabel="count")
plt.show()
In [1]: # Define echo_word as a lambda function: echo_word
echo_word = (lambda word1, echo: word1 * echo)
# Call echo_word: result
result = echo_word("hey", 5)
# Print result
print(result)
heyheyheyheyhey
# map() applies a function over an object, such as a list
In [1]: # Create a list of strings: spells
spells = ["protego", "accio", "expecto patronum", "legilimens"]
# Use map() to apply a lambda function over spells: shout_spells
shout_spells = map(lambda item: item + "!!!", spells)
# Convert shout_spells to a list: shout_spells_list
shout_spells_list = list(shout_spells)
# Convert shout_spells into a list and print it
print(shout_spells_list)
['protego!!!', 'accio!!!', 'expecto patronum!!!', 'legilimens!!!']
# The function filter() offers a way to filter out elements from a list that don't satisfy certain criteria.
In [1]: # Create a list of strings: fellowship
fellowship = ['frodo', 'samwise', 'merry', 'pippin', 'aragorn', 'boromir', 'legolas', 'gimli', 'gandalf']
# Use filter() to apply a lambda function over fellowship: result
result = filter(lambda member: len(member) > 6, fellowship)
# Convert result to a list: result_list
result_list = list(result)
# Convert result into a list and print it
print(result_list)
['samwise', 'aragorn', 'boromir', 'legolas', 'gandalf']
# The reduce() function is useful for performing some computation on a list and, unlike map() and filter(), returns a single value as a result. To use reduce(), you must import it from the functools module.
In [1]: # Import reduce from functools
from functools import reduce
# Create a list of strings: stark
stark = ['robb', 'sansa', 'arya', 'brandon', 'rickon']
# Use reduce() to apply a lambda function over stark: result
result = reduce(lambda item1, item2: item1 + item2, stark)
# Print the result
print(result)
robbsansaaryabrandonrickon
from __future__ import print_function
import sys
from operator import add
from pyspark.sql import SparkSession
if __name__ == "__main__":
if len(sys.argv) != 2:
print("Usage: wordcount <file>", file=sys.stderr)
sys.exit(-1)
spark = SparkSession\
.builder\
.appName("PythonWordCount")\
.getOrCreate()
lines = spark.read.text(sys.argv[1]).rdd.map(lambda r: r[0])
counts = lines.flatMap(lambda x: x.split(' ')) \
.map(lambda x: (x, 1)) \
.reduceByKey(add)
output = counts.collect()
for (word, count) in output:
print("%s: %i" % (word, count))
spark.stop()
In [2]: pd.set_option('display.expand_frame_repr', False)
In [3]: airquality.head()
Out[3]:
Ozone Solar.R Wind Temp Month Day
0 41.0 190.0 7.4 67 5 1
1 36.0 118.0 8.0 72 5 2
2 12.0 149.0 12.6 74 5 3
3 18.0 313.0 11.5 62 5 4
4 NaN NaN 14.3 56 5 5
In [4]: airquality.tail()
Out[4]:
Ozone Solar.R Wind Temp Month Day
148 30.0 193.0 6.9 70 9 26
149 NaN 145.0 13.2 77 9 27
150 14.0 191.0 14.3 75 9 28
151 18.0 131.0 8.0 76 9 29
152 20.0 223.0 11.5 68 9 30
In [5]: airquality.describe()
Out[5]:
Ozone Solar.R Wind Temp Month Day
count 116.000000 146.000000 153.000000 153.000000 153.000000 153.000000
mean 42.129310 185.931507 9.957516 77.882353 6.993464 15.803922
std 32.987885 90.058422 3.523001 9.465270 1.416522 8.864520
min 1.000000 7.000000 1.700000 56.000000 5.000000 1.000000
25% 18.000000 115.750000 7.400000 72.000000 6.000000 8.000000
50% 31.500000 205.000000 9.700000 79.000000 7.000000 16.000000
75% 63.250000 258.750000 11.500000 85.000000 8.000000 23.000000
max 168.000000 334.000000 20.700000 97.000000 9.000000 31.000000
In [6]: airquality.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 153 entries, 0 to 152
Data columns (total 6 columns):
Ozone 116 non-null float64
Solar.R 146 non-null float64
Wind 153 non-null float64
Temp 153 non-null int64
Month 153 non-null int64
Day 153 non-null int64
dtypes: float64(3), int64(3)
memory usage: 7.2 KB
In [8]: airquality.shape
Out[8]: (153, 6)
In [4]: # Print the head of airquality
print(airquality.head())
# Melt airquality: airquality_melt
airquality_melt = pd.melt(airquality, id_vars=['Month', 'Day'], var_name="measurement", value_name="reading")
# Print the head of airquality_melt
print(airquality_melt.head())
Ozone Solar.R Wind Temp Month Day
0 41.0 190.0 7.4 67 5 1
1 36.0 118.0 8.0 72 5 2
2 12.0 149.0 12.6 74 5 3
3 18.0 313.0 11.5 62 5 4
4 NaN NaN 14.3 56 5 5
Month Day measurement reading
0 5 1 Ozone 41.0
1 5 2 Ozone 36.0
2 5 3 Ozone 12.0
3 5 4 Ozone 18.0
4 5 5 Ozone NaN
In [13]: # Print the head of airquality_melt
print(airquality_melt.head())
# Pivot airquality_melt: airquality_pivot
airquality_pivot = pd.pivot_table(airquality_melt, index=["Month", "Day"], columns="measurement", values="reading")
print("-------------")
# Print the head of airquality_pivot
print(airquality_pivot.head())
Month Day measurement reading
0 5 1 Ozone 41.0
1 5 2 Ozone 36.0
2 5 3 Ozone 12.0
3 5 4 Ozone 18.0
4 5 5 Ozone NaN
-------------
measurement Ozone Solar.R Temp Wind
Month Day
5 1 41.0 190.0 67.0 7.4
2 36.0 118.0 72.0 8.0
3 12.0 149.0 74.0 12.6
4 18.0 313.0 62.0 11.5
5 NaN NaN 56.0 14.3
In [4]: # Print the index of airquality_pivot
print(airquality_pivot.index)
print("-------------")
# Reset the index of airquality_pivot: airquality_pivot_reset
airquality_pivot_reset = airquality_pivot.reset_index()
# Print the new index of airquality_pivot_reset
print(airquality_pivot_reset.index)
print("-------------")
# Print the head of airquality_pivot_reset
print(airquality_pivot_reset.head())
MultiIndex(levels=[[5, 6, 7, 8, 9], [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]],
labels=[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]],
names=['Month', 'Day'])
-------------
RangeIndex(start=0, stop=153, step=1)
-------------
measurement Month Day Ozone Solar.R Temp Wind
0 5 1 41.0 190.0 67.0 7.4
1 5 2 36.0 118.0 72.0 8.0
2 5 3 12.0 149.0 74.0 12.6
3 5 4 18.0 313.0 62.0 11.5
4 5 5 NaN NaN 56.0 14.3
In [2]: tb.head()
Out[2]:
country year m014 m1524 m2534 m3544 m4554 m5564 m65 mu f014 f1524 f2534 f3544 f4554 f5564 f65 fu
0 AD 2000 0.0 0.0 1.0 0.0 0.0 0.0 0.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 AE 2000 2.0 4.0 4.0 6.0 5.0 12.0 10.0 NaN 3.0 16.0 1.0 3.0 0.0 0.0 4.0 NaN
2 AF 2000 52.0 228.0 183.0 149.0 129.0 94.0 80.0 NaN 93.0 414.0 565.0 339.0 205.0 99.0 36.0 NaN
3 AG 2000 0.0 0.0 0.0 0.0 0.0 0.0 1.0 NaN 1.0 1.0 1.0 0.0 0.0 0.0 0.0 NaN
4 AL 2000 2.0 19.0 21.0 14.0 24.0 19.0 16.0 NaN 3.0 11.0 10.0 8.0 8.0 5.0 11.0 NaN
In [3]: # Melt tb: tb_melt
tb_melt = pd.melt(tb, id_vars=["country","year"])
# Create the 'gender' column
tb_melt['gender'] = tb_melt.variable.str[0]
# Create the 'age_group' column
tb_melt['age_group'] = tb_melt.variable.str[1:]
# Print the head of tb_melt
print(tb_melt.head())
country year variable value gender age_group
0 AD 2000 m014 0.0 m 014
1 AE 2000 m014 2.0 m 014
2 AF 2000 m014 52.0 m 014
3 AG 2000 m014 0.0 m 014
4 AL 2000 m014 2.0 m 014
In [2]: ebola.head()
Out[2]:
Date Day Cases_Guinea Cases_Liberia Cases_SierraLeone Cases_Nigeria Cases_Senegal Cases_UnitedStates Cases_Spain Cases_Mali Deaths_Guinea Deaths_Liberia Deaths_SierraLeone Deaths_Nigeria Deaths_Senegal Deaths_UnitedStates Deaths_Spain Deaths_Mali
0 1/5/2015 289 2776.0 NaN 10030.0 NaN NaN NaN NaN NaN 1786.0 NaN 2977.0 NaN NaN NaN NaN NaN
1 1/4/2015 288 2775.0 NaN 9780.0 NaN NaN NaN NaN NaN 1781.0 NaN 2943.0 NaN NaN NaN NaN NaN
2 1/3/2015 287 2769.0 8166.0 9722.0 NaN NaN NaN NaN NaN 1767.0 3496.0 2915.0 NaN NaN NaN NaN NaN
3 1/2/2015 286 NaN 8157.0 NaN NaN NaN NaN NaN NaN NaN 3496.0 NaN NaN NaN NaN NaN NaN
4 12/31/2014 284 2730.0 8115.0 9633.0 NaN NaN NaN NaN NaN 1739.0 3471.0 2827.0 NaN NaN NaN NaN NaN
In [8]: # Melt ebola: ebola_melt
ebola_melt = pd.melt(ebola, id_vars=["Date", "Day"], var_name="type_country", value_name="counts")
# Create the 'str_split' column
ebola_melt['str_split'] = ebola_melt['type_country'].str.split("_")
# Create the 'type' column
ebola_melt['type'] = ebola_melt['str_split'].str.get(0)
# Create the 'country' column
ebola_melt['country'] = ebola_melt['str_split'].str.get(1)
# Print the head of ebola_melt
print(ebola_melt.head())
Date Day type_country counts str_split type country
0 1/5/2015 289 Cases_Guinea 2776.0 [Cases, Guinea] Cases Guinea
1 1/4/2015 288 Cases_Guinea 2775.0 [Cases, Guinea] Cases Guinea
2 1/3/2015 287 Cases_Guinea 2769.0 [Cases, Guinea] Cases Guinea
3 1/2/2015 286 Cases_Guinea NaN [Cases, Guinea] Cases Guinea
4 12/31/2014 284 Cases_Guinea 2730.0 [Cases, Guinea] Cases Guinea
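# str.split() also takes expand=True, which returns the pieces as DataFrame
# columns and skips the intermediate list column; a one-line sketch over the
# same melted frame:
ebola_melt[['type', 'country']] = ebola_melt['type_country'].str.split('_', expand=True)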
# Import necessary modules
import pandas as pd
import glob
# Write the pattern: pattern
pattern = '*.csv'
# Save all file matches: csv_files
csv_files = glob.glob(pattern)
# Print the file names
print(csv_files)
# Load the second file into a DataFrame: csv2
csv2 = pd.read_csv(csv_files[1])
# Print the head of csv2
print(csv2.head())
# Initialize empty list to collect the DataFrames: frames
frames = []
# Iterate over csv_files
for csv in csv_files:
    # Read csv into a DataFrame: df
    df = pd.read_csv(csv)
    # Append df to frames
    frames.append(df)
# Concatenate frames into a single DataFrame: uber
uber = pd.concat(frames)
# Print the shape of uber
print(uber.shape)
# Print the head of uber
print(uber.head())
In [7]: site
Out[7]:
name lat long
0 DR-1 -49.85 -128.57
1 DR-3 -47.15 -126.72
2 MSK-4 -48.87 -123.40
In [8]: visited
Out[8]:
ident site dated
0 619 DR-1 1927-02-08
1 622 DR-1 1927-02-10
2 734 DR-3 1939-01-07
3 735 DR-3 1930-01-12
4 751 DR-3 1930-02-26
5 752 DR-3 NaN
6 837 MSK-4 1932-01-14
7 844 DR-1 1932-03-22
In [9]: survey
Out[9]:
taken person quant reading
0 619 dyer rad 9.82
1 619 dyer sal 0.13
2 622 dyer rad 7.80
3 622 dyer sal 0.09
4 734 pb rad 8.41
5 734 lake sal 0.05
6 734 pb temp -21.50
7 735 pb rad 7.22
8 735 NaN sal 0.06
9 735 NaN temp -26.00
10 751 pb rad 4.35
11 751 pb temp -18.50
12 751 lake sal 0.10
13 752 lake rad 2.19
14 752 lake sal 0.09
15 752 lake temp -16.00
16 752 roe sal 41.60
17 837 lake rad 1.46
18 837 lake sal 0.21
19 837 roe sal 22.50
20 844 roe rad 11.25
In [6]: # Merge site and visited: m2m
m2m = pd.merge(left=site, right=visited, left_on='name', right_on='site')
# Merge m2m and survey: m2m
m2m = pd.merge(left=m2m, right=survey, left_on="ident", right_on="taken")
# Print the first 20 lines of m2m
print(m2m.head(20))
name lat long ident site dated taken person quant reading
0 DR-1 -49.85 -128.57 619 DR-1 1927-02-08 619 dyer rad 9.82
1 DR-1 -49.85 -128.57 619 DR-1 1927-02-08 619 dyer sal 0.13
2 DR-1 -49.85 -128.57 622 DR-1 1927-02-10 622 dyer rad 7.80
3 DR-1 -49.85 -128.57 622 DR-1 1927-02-10 622 dyer sal 0.09
4 DR-1 -49.85 -128.57 844 DR-1 1932-03-22 844 roe rad 11.25
5 DR-3 -47.15 -126.72 734 DR-3 1939-01-07 734 pb rad 8.41
6 DR-3 -47.15 -126.72 734 DR-3 1939-01-07 734 lake sal 0.05
7 DR-3 -47.15 -126.72 734 DR-3 1939-01-07 734 pb temp -21.50
8 DR-3 -47.15 -126.72 735 DR-3 1930-01-12 735 pb rad 7.22
9 DR-3 -47.15 -126.72 735 DR-3 1930-01-12 735 NaN sal 0.06
10 DR-3 -47.15 -126.72 735 DR-3 1930-01-12 735 NaN temp -26.00
11 DR-3 -47.15 -126.72 751 DR-3 1930-02-26 751 pb rad 4.35
12 DR-3 -47.15 -126.72 751 DR-3 1930-02-26 751 pb temp -18.50
13 DR-3 -47.15 -126.72 751 DR-3 1930-02-26 751 lake sal 0.10
14 DR-3 -47.15 -126.72 752 DR-3 NaN 752 lake rad 2.19
15 DR-3 -47.15 -126.72 752 DR-3 NaN 752 lake sal 0.09
16 DR-3 -47.15 -126.72 752 DR-3 NaN 752 lake temp -16.00
17 DR-3 -47.15 -126.72 752 DR-3 NaN 752 roe sal 41.60
18 MSK-4 -48.87 -123.40 837 MSK-4 1932-01-14 837 lake rad 1.46
19 MSK-4 -48.87 -123.40 837 MSK-4 1932-01-14 837 lake sal 0.21
In [2]: # Define recode_gender()
def recode_gender(gender):
    # Return 1 if gender is 'Male'
    if gender == 'Male':
        return 1
    # Return 0 if gender is 'Female'
    elif gender == 'Female':
        return 0
    # Return np.nan otherwise
    else:
        return np.nan

# Apply the function to the sex column
tips['recode'] = tips.sex.apply(recode_gender)
# Print the first five rows of tips
print(tips.head())
total_bill tip sex smoker day time size recode
0 16.99 1.01 Female No Sun Dinner 2.0 0.0
1 10.34 1.66 Male No Sun Dinner 3.0 1.0
2 21.01 3.50 Male No Sun Dinner 3.0 1.0
3 23.68 3.31 Male No Sun Dinner 2.0 1.0
4 24.59 3.61 Female No Sun Dinner 4.0 0.0
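# The same recode can be written with .map() and a dictionary, which leaves
# unmapped values as NaN automatically; a sketch:
tips['recode'] = tips.sex.map({'Male': 1, 'Female': 0})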
In [3]: tips.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 8 columns):
total_bill 202 non-null float64
tip 220 non-null float64
sex 234 non-null object
smoker 229 non-null object
day 243 non-null object
time 227 non-null object
size 231 non-null float64
recode 234 non-null float64
dtypes: float64(4), object(4)
memory usage: 15.3+ KB
In [6]: g1800s.head()
Out[6]:
Life expectancy ... 1899
0 Abkhazia ... NaN
1 Afghanistan ... 27.20
2 Akrotiri and Dhekelia ... NaN
3 Albania ... 35.40
4 Algeria ... 28.82
[5 rows x 101 columns]
In [3]: g1800s.describe()
Out[3]:
1800 ... 1899
count 201.000000 ... 201.000000
mean 31.486020 ... 32.962985
std 3.763585 ... 6.325265
min 23.390000 ... 18.900000
25% 29.000000 ... 29.000000
50% 31.800000 ... 32.000000
75% 33.900000 ... 35.180000
max 42.850000 ... 51.620000
[8 rows x 100 columns]
In [4]: g1800s.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260 entries, 0 to 259
Columns: 101 entries, Life expectancy to 1899
dtypes: float64(100), object(1)
memory usage: 205.2+ KB
In [5]: g1800s.shape
Out[5]: (260, 101)
In [2]: def check_null_or_valid(row_data):
    """Function that takes a row of data,
    drops all missing values,
    and checks if all remaining values are greater than or equal to 0
    """
    no_na = row_data.dropna()
    numeric = pd.to_numeric(no_na)
    ge0 = numeric >= 0
    return ge0
# Check whether the first column is 'Life expectancy'
assert g1800s.columns[0] == 'Life expectancy'
# Check whether the values in the row are valid
assert g1800s.iloc[:, 1:].apply(check_null_or_valid, axis=1).all().all()
# Check that there is only one instance of each country
assert g1800s['Life expectancy'].value_counts()[0] == 1
In [1]: # Create the series of countries: countries
countries = gapminder['country']
# Drop all the duplicates from countries
countries = countries.drop_duplicates()
# Write the regular expression: pattern
pattern = r'^[A-Za-z\.\s]*$'
# Create the Boolean vector: mask
mask = countries.str.contains(pattern)
# Invert the mask: mask_inverse
mask_inverse = ~mask
# Subset countries using mask_inverse: invalid_countries
invalid_countries = countries.loc[mask_inverse]
# Print invalid_countries
print(invalid_countries)
49 Congo, Dem. Rep.
50 Congo, Rep.
53 Cote d'Ivoire
73 Falkland Is (Malvinas)
93 Guinea-Bissau
98 Hong Kong, China
118 United Korea (former)\n
131 Macao, China
132 Macedonia, FYR
145 Micronesia, Fed. Sts.
161 Ngorno-Karabakh
187 St. Barthélemy
193 St.-Pierre-et-Miquelon
225 Timor-Leste
251 Virgin Islands (U.S.)
252 North Yemen (former)
253 South Yemen (former)
258 Åland
Name: country, dtype: object
In [1]: gapminder.shape
Out[1]: (169260, 3)
In [2]: # Assert that country does not contain any missing values
assert pd.notnull(gapminder.country).all()
# Assert that year does not contain any missing values
assert pd.notnull(gapminder.year).all()
# Drop the missing values
gapminder = gapminder.dropna()
# Print the shape of gapminder
print(gapminder.shape)
(43857, 3)
In [8]: # Add first subplot
plt.subplot(2, 1, 1)
# Create a histogram of life_expectancy
gapminder.life_expectancy.plot(kind='hist')
# Group gapminder: gapminder_agg
gapminder_agg = gapminder.groupby('year')['life_expectancy'].mean()
# Print the head of gapminder_agg
print(gapminder_agg.head())
# Print the tail of gapminder_agg
print(gapminder_agg.tail())
# Add second subplot
plt.subplot(2, 1, 2)
# Create a line plot of life expectancy per year
gapminder_agg.plot()
# Add title and specify axis labels
plt.title('Life expectancy over the years')
plt.ylabel('Life expectancy')
plt.xlabel('Year')
# Display the plots
plt.tight_layout()
plt.show()
# Save both DataFrames to csv files
gapminder.to_csv("gapminder.csv")
gapminder_agg.to_csv("gapminder_agg.csv")
year
1800 31.486020
1801 31.448905
1802 31.463483
1803 31.377413
1804 31.446318
Name: life_expectancy, dtype: float64
year
2012 71.663077
2013 71.916106
2014 72.088125
2015 72.321010
2016 72.556635
Name: life_expectancy, dtype: float64
In [24]: list_keys
Out[24]: ['Country', 'Total']
In [25]: list_values
Out[25]: [['United States', 'Soviet Union', 'United Kingdom'], [1118, 473, 273]]
# Zip the 2 lists together into one list of (key,value) tuples: zipped
zipped = list(zip(list_keys, list_values))
# Inspect the list using print()
print(zipped)
# Build a dictionary with the zipped list: data
data = dict(zipped)
# Build and inspect a DataFrame from the dictionary: df
df = pd.DataFrame(data)
print(df)
[('Country', ['United States', 'Soviet Union', 'United Kingdom']), ('Total', [1118, 473, 273])]
Country Total
0 United States 1118
1 Soviet Union 473
2 United Kingdom 273
# Create a list of y-axis column names: y_columns
y_columns = ["AAPL", "IBM"]
# Generate a line plot
df.plot(x="Month", y=y_columns)
# Generate a scatter plot
df.plot(kind="scatter", x='hp', y='mpg', s=sizes)
'''
While pandas can plot multiple columns of data in a single figure,
making plots that share the same x and y axes, there are cases where
two columns cannot be plotted together because their units do not match.
The .plot() method can generate subplots for each column being plotted.
Here, each plot will be scaled independently.
'''
# Make a list of the column names to be plotted: cols
cols = ["weight", "mpg"]
# Generate the box plots
df[cols].plot(kind="box", subplots=True)
# This formats the plots such that they appear on separate rows
fig, axes = plt.subplots(nrows=2, ncols=1)
# Plot the PDF
df.fraction.plot(ax=axes[0], kind='hist', bins=30, density=True, range=(0,.3))
plt.show()
# Plot the CDF
df.fraction.plot(ax=axes[1], kind='hist', density=True, cumulative=True, bins=30, range=(0,.3))
# Add the title
plt.title('Monthly stock prices')
# Add the y-axis label
plt.ylabel('Price ($US)')
# Display the plot
plt.show()
print(df.quantile([0.05, 0.95]))
# Reindex without fill method: ts3
ts3 = ts2.reindex(ts1.index)
# Reindex with fill method, using forward fill: ts4
ts4 = ts2.reindex(ts1.index, method="ffill")
# Combine ts1 + ts2: sum12
sum12 = ts1 + ts2
# Combine ts1 + ts3: sum13
sum13 = ts1 + ts3
# Combine ts1 + ts4: sum14
sum14 = ts1 + ts4
In [2]: ts1.head()
Out[2]:
2016-07-01 0
2016-07-02 1
2016-07-03 2
2016-07-04 3
2016-07-05 4
dtype: int64
In [3]: ts2.head()
Out[3]:
2016-07-01 0
2016-07-04 1
2016-07-05 2
2016-07-06 3
2016-07-07 4
dtype: int64
In [4]: sum14.head()
Out[4]:
2016-07-01 0
2016-07-02 1
2016-07-03 2
2016-07-04 4
2016-07-05 6
dtype: int64
# Downsample to 6 hour data and aggregate by mean: df1
df1 = df.Temperature.resample("6H").mean()
# Downsample to daily data and count the number of data points: df2
df2 = df.Temperature.resample("D").count()
# Extract data from 2010-Aug-01 to 2010-Aug-15: unsmoothed
unsmoothed = df['Temperature']["2010-Aug-01":"2010-Aug-15"]
# Apply a rolling mean with a 24 hour window: smoothed
smoothed = unsmoothed.rolling(window=24).mean()
# Create a new DataFrame with columns smoothed and unsmoothed: august
august = pd.DataFrame({'smoothed':smoothed, 'unsmoothed': unsmoothed})
# Plot both smoothed and unsmoothed data using august.plot().
august.plot()
plt.show()
# Reset the index of ts2 to ts1, and then use linear interpolation to fill in the NaNs: ts2_interp
ts2_interp = ts2.reindex(ts1.index).interpolate(method="linear")
# Build a Boolean mask to select all the flights bound for 'LAX': mask
mask = df['Destination Airport'] == "LAX"
# Use the mask to subset the data: la
la = df[mask]
# Combine two columns of data to create a datetime series: times_tz_none
times_tz_none = pd.to_datetime( la['Date (MM/DD/YYYY)'] + " " + la['Wheels-off Time'] )
# Localize the time to US/Central: times_tz_central
times_tz_central = times_tz_none.dt.tz_localize("US/Central")
# Convert the datetimes from US/Central to US/Pacific
times_tz_pacific = times_tz_central.dt.tz_convert("US/Pacific")
# Read in the data file with header=None: df
df = pd.read_csv(data_file, header=None)
# Remove the appropriate columns: df_dropped
df_dropped = df.drop(list_to_drop, axis="columns")
# Convert the date column to string: df_dropped['date']
df_dropped['date'] = df_dropped['date'].astype(str)
# Pad leading zeros to the Time column: df_dropped['Time']
df_dropped['Time'] = df_dropped['Time'].apply(lambda x:'{:0>4}'.format(x))
# Concatenate the new date and Time columns: date_string
date_string = df_dropped.date + df_dropped.Time
# Convert the date_string Series to datetime: date_times
date_times = pd.to_datetime(date_string, format='%Y%m%d%H%M')
# Set the index to be the new date_times container: df_clean
df_clean = df_dropped.set_index(date_times)
# Convert the dry_bulb_faren column to numeric values: df_clean['dry_bulb_faren']
df_clean['dry_bulb_faren'] = pd.to_numeric(df_clean['dry_bulb_faren'], errors="coerce")
# Convert the wind_speed and dew_point_faren columns to numeric values
df_clean['wind_speed'] = pd.to_numeric(df_clean.wind_speed, errors="coerce")
df_clean['dew_point_faren'] = pd.to_numeric(df_clean.dew_point_faren, errors="coerce")
# From previous steps
is_sky_clear = df_clean['sky_condition']=='CLR'
sunny = df_clean.loc[is_sky_clear]
sunny_daily_max = sunny.resample('D').max()
is_sky_overcast = df_clean['sky_condition'].str.contains('OVC')
overcast = df_clean.loc[is_sky_overcast]
overcast_daily_max = overcast.resample('D').max()
# Calculate the mean of sunny_daily_max
sunny_daily_max_mean = sunny_daily_max.mean()
# Calculate the mean of overcast_daily_max
overcast_daily_max_mean = overcast_daily_max.mean()
# Print the difference (sunny minus overcast)
print(sunny_daily_max_mean - overcast_daily_max_mean)
In [3]: election.head()
Out[3]:
state total Obama Romney winner voters
county
Adams PA 41973 35.482334 63.112001 Romney 61156
Allegheny PA 614671 56.640219 42.185820 Obama 924351
Armstrong PA 28322 30.696985 67.901278 Romney 42147
Beaver PA 80015 46.032619 52.637630 Romney 115157
Bedford PA 21444 22.057452 76.986570 Romney 32189
# Slice the row labels 'Perry' to 'Potter': p_counties
p_counties = election.loc['Perry':'Potter', :]
# Slice the row labels 'Potter' to 'Perry' in reverse order: p_counties_rev
p_counties_rev = election.loc['Potter':'Perry':-1, :]
# Slice the columns from the starting column to 'Obama': left_columns
left_columns = election.loc[:, :'Obama']
# Slice the columns from 'Obama' to 'winner': middle_columns
middle_columns = election.loc[:, 'Obama':'winner']
# Slice the columns from 'Romney' to the end: 'right_columns'
right_columns = election.loc[:, 'Romney':]
# Create the list of row labels: rows
rows = ['Philadelphia', 'Centre', 'Fulton']
# Create the list of column labels: cols
cols = ['winner', 'Obama', 'Romney']
# Create the new DataFrame: three_counties
three_counties = election.loc[rows, cols]
# Create the boolean array: high_turnout
high_turnout = election['turnout'] > 70
# Filter the election DataFrame with the high_turnout array: high_turnout_df
high_turnout_df = election[high_turnout]
# Create the boolean array: too_close
too_close = election['margin'] < 1
# Assign np.nan to the 'winner' column where the results were too close to call
election.loc[too_close, 'winner'] = np.nan
# Select the 'age' and 'cabin' columns: df
df = titanic.loc[:, ['age','cabin']]
# Print the shape of df
print(df.shape)
# Drop rows in df with how='any' and print the shape
print(df.dropna(how='any').shape)
# Drop rows in df with how='all' and print the shape
print(df.dropna(how="all").shape)
# Drop columns in titanic with less than 1000 non-missing values
print(titanic.dropna(thresh=1000, axis='columns').info())
# Write a function to convert degrees Fahrenheit to degrees Celsius: to_celsius
def to_celsius(F):
    return 5 / 9 * (F - 32)
# Apply the function over 'Mean TemperatureF' and 'Mean Dew PointF': df_celsius
df_celsius = weather[['Mean TemperatureF', 'Mean Dew PointF']].apply(to_celsius)
# Reassign the columns df_celsius
df_celsius.columns = ['Mean TemperatureC', 'Mean Dew PointC']
# Create the dictionary: red_vs_blue
red_vs_blue = {'Obama': 'blue', 'Romney': 'red'}
# Use the dictionary to map the 'winner' column to the new column: election['color']
election['color'] = election['winner'].map(red_vs_blue)
# Import zscore from scipy.stats
from scipy.stats import zscore
# Call zscore with election['turnout'] as input: turnout_zscore
turnout_zscore = zscore(election['turnout'])
# Print the type of turnout_zscore
print(type(turnout_zscore))
# Assign turnout_zscore to a new column: election['turnout_zscore']
election['turnout_zscore'] = turnout_zscore
In [1]: sales
Out[1]:
state month eggs salt spam
0 CA 1 47 12.0 17
1 CA 2 110 50.0 31
2 NY 1 221 89.0 72
3 NY 2 77 87.0 20
4 TX 1 132 NaN 52
5 TX 2 205 60.0 55
In [5]: # Set the index to be the columns ['state', 'month']: sales
sales = sales.set_index(['state', 'month'])
# Sort the MultiIndex: sales
sales = sales.sort_index()
# Print the sales DataFrame
print(sales)
eggs salt spam
state month
CA 1 47 12.0 17
2 110 50.0 31
NY 1 221 89.0 72
2 77 87.0 20
TX 1 132 NaN 52
2 205 60.0 55
In [23]: # Look up data for NY in month 1: NY_month1
In [25]: sales.loc[('NY', 1), :]
Out[25]:
eggs 221.0
salt 89.0
spam 72.0
Name: (NY, 1), dtype: float64
In [26]: # Look up data for CA and TX in month 2: CA_TX_month2
In [27]: sales.loc[(['CA','TX'], 2), :]
Out[27]:
eggs salt spam
state month
CA 2 110 50.0 31
TX 2 205 60.0 55
In [28]: # Look up data for all states in month 2: all_month2
In [29]: sales.loc[(slice(None), 2), :]
Out[29]:
eggs salt spam
state month
CA 2 110 50.0 31
NY 2 77 87.0 20
TX 2 205 60.0 55
In [6]: users
Out[6]:
visitors signups
city weekday
Austin Mon 326 3
Sun 139 7
Dallas Mon 456 5
Sun 237 12
In [7]: users.unstack(level='weekday')
Out[7]:
visitors signups
weekday Mon Sun Mon Sun
city
Austin 326 139 3 7
Dallas 456 237 5 12
In [8]: byweekday = users.unstack(level='weekday')
In [9]: byweekday.stack(level='weekday')
Out[9]:
visitors signups
city weekday
Austin Mon 326 3
Sun 139 7
Dallas Mon 456 5
Sun 237 12
In [1]: visitors_by_city_weekday
Out[1]:
city Austin Dallas
weekday
Mon 326 456
Sun 139 237
# Reset the index: visitors_by_city_weekday
visitors_by_city_weekday = visitors_by_city_weekday.reset_index()
# Print visitors_by_city_weekday
print(visitors_by_city_weekday)
# Melt visitors_by_city_weekday: visitors
visitors = pd.melt(visitors_by_city_weekday, id_vars='weekday', value_name='visitors')
# Print visitors
print(visitors)
<script.py> output:
city weekday Austin Dallas
0 Mon 326 456
1 Sun 139 237
weekday city visitors
0 Mon Austin 326
1 Sun Austin 139
2 Mon Dallas 456
3 Sun Dallas 237
# Group titanic by 'pclass': by_class
by_class = titanic.groupby('pclass')
# Select 'age' and 'fare'
by_class_sub = by_class[['age','fare']]
# Aggregate by_class_sub by 'max' and 'median': aggregated
aggregated = by_class_sub.agg(['max', 'median'])
# Print the maximum age in each class
print(aggregated.loc[:, ('age','max')])
# Print the median fare in each class
print(aggregated.loc[:, ('fare','median')])
# Read the CSV file into a DataFrame and sort the index: gapminder
gapminder = pd.read_csv('gapminder.csv', index_col=['Year','region','Country']).sort_index()
# Group gapminder by 'Year' and 'region': by_year_region
by_year_region = gapminder.groupby(level=['Year', 'region'])
# Define the function to compute spread: spread
def spread(series):
    return series.max() - series.min()
# Create the dictionary: aggregator
aggregator = {'population':'sum', 'child_mortality':'mean', 'gdp':spread}
# Aggregate by_year_region using the dictionary: aggregated
aggregated = by_year_region.agg(aggregator)
# Print the last 6 entries of aggregated
print(aggregated.tail(6))
gdp child_mortality population
Year region
2013 America 49634.0 17.745833 9.629087e+08
East Asia & Pacific 134744.0 22.285714 2.244209e+09
Europe & Central Asia 86418.0 9.831875 8.968788e+08
Middle East & North Africa 128676.0 20.221500 4.030504e+08
South Asia 11469.0 46.287500 1.701241e+09
Sub-Saharan Africa 32035.0 76.944490 9.205996e+08
# GROUP BY DATETIME
# Read file: sales
sales = pd.read_csv('sales.csv', index_col='Date', parse_dates=True)
# Create a groupby object: by_day
by_day = sales.groupby(sales.index.strftime('%a'))
# Create sum: units_sum
units_sum = by_day['Units'].sum()
# Print units_sum
print(units_sum)
<script.py> output:
Mon 48
Sat 7
Thu 59
Tue 13
Wed 48
Name: Units, dtype: int64
# Import zscore
from scipy.stats import zscore
# Group gapminder_2010: standardized
standardized = gapminder_2010.groupby('region')[['life', 'fertility']].transform(zscore)
# Construct a Boolean Series to identify outliers: outliers
outliers = (standardized['life'] < -3) | (standardized['fertility'] > 3)
# Filter gapminder_2010 by the outliers: gm_outliers
gm_outliers = gapminder_2010.loc[outliers]
# Print gm_outliers
print(gm_outliers)
# Read the CSV file into a DataFrame: sales
sales = pd.read_csv('sales.csv', index_col='Date', parse_dates=True)
# Group sales by 'Company': by_company
by_company = sales.groupby('Company')
# Compute the sum of the 'Units' of by_company: by_com_sum
by_com_sum = by_company['Units'].sum()
print(by_com_sum)
# Filter 'Units' where the sum is > 35: by_com_filt
by_com_filt = by_company.filter(lambda g: g['Units'].sum() > 35)
print(by_com_filt)
In [1]: medals.head()
Out[1]:
City Edition Sport Discipline Athlete NOC Gender Event Event_gender Medal
0 Athens 1896 Aquatics Swimming HAJOS, Alfred HUN Men 100m freestyle M Gold
1 Athens 1896 Aquatics Swimming HERSCHMANN, Otto AUT Men 100m freestyle M Silver
2 Athens 1896 Aquatics Swimming DRIVAS, Dimitrios GRE Men 100m freestyle for sailors M Bronze
3 Athens 1896 Aquatics Swimming MALOKINIS, Ioannis GRE Men 100m freestyle for sailors M Gold
4 Athens 1896 Aquatics Swimming CHASAPIS, Spiridon GRE Men 100m freestyle for sailors M Silver
In [2]: medals.loc[medals.NOC == 'USA'].groupby('Edition')['Medal'].count()
Out[2]:
Edition
1896 20
1900 55
1904 394
1908 63
1912 101
1920 193
1924 198
1928 84
1932 181
1936 92
1948 148
1952 130
1956 118
1960 112
1964 150
1968 149
1972 155
1976 155
1984 333
1988 193
1992 224
1996 260
2000 248
2004 264
2008 315
In [4]: # Select the 'NOC' column of medals: country_names
country_names = medals['NOC']
# Count the number of medals won by each country: medal_counts
medal_counts = country_names.value_counts()
# Print top 15 countries ranked by medals
print(medal_counts.head(15))
<script.py> output:
USA 4335
URS 2049
GBR 1594
FRA 1314
ITA 1228
In [4]: # Construct the pivot table: counted
counted = medals.pivot_table(index='NOC', columns='Medal', values='Athlete', aggfunc='count')
# Create the new column: counted['totals']
counted['totals'] = counted.sum(axis='columns')
# Sort counted by the 'totals' column
counted = counted.sort_values('totals', ascending=False)
# Print the top 15 rows of counted
print(counted.head(15))
Medal Bronze Gold Silver totals
NOC
USA 1052.0 2088.0 1195.0 4335.0
URS 584.0 838.0 627.0 2049.0
GBR 505.0 498.0 591.0 1594.0
FRA 475.0 378.0 461.0 1314.0
ITA 374.0 460.0 394.0 1228.0
In [1]: # Select columns: ev_gen
ev_gen = medals[['Event_gender', 'Gender']]
# Drop duplicate pairs: ev_gen_uniques
ev_gen_uniques = ev_gen.drop_duplicates()
# Print ev_gen_uniques
print(ev_gen_uniques)
Event_gender Gender
0 M Men
348 X Men
416 W Women
639 X Women
23675 W Men
In [13]: medals[(medals.Event_gender == 'W') & (medals.Gender == 'Men')]
Out[13]:
City Edition Sport Discipline Athlete NOC Gender Event Event_gender Medal
23675 Sydney 2000 Athletics Athletics CHEPCHUMBA, Joyce KEN Men marathon W Bronze
In [2]: # Group medals by 'NOC': country_grouped
country_grouped = medals.groupby('NOC')
# Compute the number of distinct sports in which each country won medals: Nsports
Nsports = country_grouped['Sport'].nunique()
# Sort the values of Nsports in descending order
Nsports = Nsports.sort_values(ascending=False)
# Print the top 15 rows of Nsports
print(Nsports.head(15))
NOC
USA 34
GBR 31
FRA 28
GER 26
CHN 24
In [3]: # Extract all rows for which the 'Edition' is between 1952 & 1988: during_cold_war
during_cold_war = (medals['Edition'] >= 1952) & (medals['Edition'] <= 1988)
# Extract rows for which 'NOC' is either 'USA' or 'URS': is_usa_urs
is_usa_urs = medals.NOC.isin(['USA','URS'])
# Use during_cold_war and is_usa_urs to create the DataFrame: cold_war_medals
cold_war_medals = medals.loc[during_cold_war & is_usa_urs]
# Group cold_war_medals by 'NOC'
country_grouped = cold_war_medals.groupby('NOC')
# Create Nsports
Nsports = country_grouped['Sport'].nunique().sort_values(ascending=False)
# Print Nsports
print(Nsports)
NOC
URS 21
USA 20
Name: Sport, dtype: int64
In [10]: medals.pivot_table(index='Edition', columns='NOC', values='Athlete', aggfunc='count').loc[1952:1988, ['USA','URS']]
Out[10]:
NOC USA URS
Edition
1952 130.0 117.0
1956 118.0 169.0
1960 112.0 169.0
1964 150.0 174.0
1968 149.0 188.0
1972 155.0 211.0
1976 155.0 285.0
1980 NaN 442.0
1984 333.0 NaN
1988 193.0 294.0
# Create the DataFrame: usa
usa = medals[medals['NOC'] == 'USA']
# Group usa by ['Edition', 'Medal'] and aggregate over 'Athlete'
usa_medals_by_year = usa.groupby(['Edition', 'Medal'] )['Athlete'].count()
# Reshape usa_medals_by_year by unstacking
usa_medals_by_year = usa_medals_by_year.unstack(level='Medal')
# Plot the DataFrame usa_medals_by_year
usa_medals_by_year.plot()
plt.show()