INTRO TO PYTHON FOR DATA SCIENCE

NUMPY

# height_in and weight_lb are available as regular lists

# Import numpy
import numpy as np

# Calculate the BMI: bmi
np_height_m = np.array(height_in) * 0.0254
np_weight_kg = np.array(weight_lb) * 0.453592
bmi = np_weight_kg / np_height_m ** 2

# Create the light array
light = bmi < 21

# Print out light
print(light)

# Print out BMIs of all baseball players whose BMI is below 21
print(bmi[light])



# numpy and matplotlib imported, seed set

# Simulate random walk 500 times
all_walks = []
for i in range(500):
    random_walk = [0]
    for x in range(100):
        step = random_walk[-1]
        dice = np.random.randint(1, 7)
        if dice <= 2:
            step = max(0, step - 1)
        elif dice <= 5:
            step = step + 1
        else:
            step = step + np.random.randint(1, 7)
        if np.random.rand() <= 0.001:
            step = 0
        random_walk.append(step)
    all_walks.append(random_walk)

# Create and plot np_aw_t
np_aw_t = np.transpose(np.array(all_walks))

# Select last row from np_aw_t: ends
ends = np_aw_t[-1]

# Plot histogram of ends, display plot
plt.hist(ends)
plt.show()


# calculate the chance that this end point is greater than or equal to 60
In [1]: len(ends[ends >= 60]) / 500
Out[1]: 0.784




# Import pandas
import pandas as pd

# Import Twitter data as DataFrame: df
df = pd.read_csv("tweets.csv")

# Initialize an empty dictionary: langs_count
langs_count = {}

# Extract column from DataFrame: col
col = df['lang']

# Iterate over lang column in DataFrame
for entry in col:

    # If the language is in langs_count, add 1
    if entry in langs_count.keys():
        langs_count[entry] = langs_count[entry] + 1
    # Else add the language to langs_count, set the value to 1
    else:
        langs_count[entry] = 1

# Print the populated dictionary
print(langs_count)

{'et': 1, 'en': 97, 'und': 2}




# Define count_entries()
def count_entries(df, col_name):
    """Return a dictionary with counts of 
    occurrences as value for each key."""

    # Initialize an empty dictionary: langs_count
    langs_count = {}
    
    # Extract column from DataFrame: col
    col = df[col_name]
    
    # Iterate over lang column in DataFrame
    for entry in col:

        # If the language is in langs_count, add 1
        if entry in langs_count.keys():
            langs_count[entry] = langs_count[entry] + 1
        # Else add the language to langs_count, set the value to 1
        else:
            langs_count[entry] = 1

    # Return the langs_count dictionary
    return langs_count
    
# Call count_entries(): result
result = count_entries(tweets_df, 'lang')

# Print the result
print(result)

{'et': 1, 'en': 97, 'und': 2}




# Define count_entries()
def count_entries(df, col_name='lang'):
    """Return a dictionary with counts of
    occurrences as value for each key."""

    # Initialize an empty dictionary: cols_count
    cols_count = {}

    # Extract column from DataFrame: col
    col = df[col_name]
    
    # Iterate over the column in DataFrame
    for entry in col:

        # If entry is in cols_count, add 1
        if entry in cols_count.keys():
            cols_count[entry] += 1

        # Else add the entry to cols_count, set the value to 1
        else:
            cols_count[entry] = 1

    # Return the cols_count dictionary
    return cols_count

# Call count_entries(): result1
result1 = count_entries(tweets_df, 'lang')

# Call count_entries(): result2
result2 = count_entries(tweets_df, 'source')

# Print result1 and result2
print(result1)
print(result2)

{'et': 1, 'en': 97, 'und': 2}
{'<a href="http://ru....







# Define count_entries()
def count_entries(df, *args):
    """Return a dictionary with counts of
    occurrences as value for each key."""
    
    #Initialize an empty dictionary: cols_count
    cols_count = {}
    
    # Iterate over column names in args
    for col_name in args:
    
        # Extract column from DataFrame: col
        col = df[col_name]
    
        # Iterate over the column in DataFrame
        for entry in col:
    
            # If entry is in cols_count, add 1
            if entry in cols_count.keys():
                cols_count[entry] += 1
    
            # Else add the entry to cols_count, set the value to 1
            else:
                cols_count[entry] = 1

    # Return the cols_count dictionary
    return cols_count

# Call count_entries(): result1
result1 = count_entries(tweets_df, 'lang')

# Call count_entries(): result2
result2 = count_entries(tweets_df, 'lang', 'source')

# Print result1 and result2
print(result1)
print(result2)

{'et': 1, 'en': 97, 'und': 2}
{'<a href="http://rutracker.o....


# Select retweets from the Twitter DataFrame: result
result = filter(lambda x: x[0:2] == "RT", tweets_df['text'])

# Create list from filter object result: res_list
res_list = list(result)

# Print all retweets in res_list
for tweet in res_list:
    print(tweet)
    
RT @bpolitics: .@krollbondrating's Christopher Whalen says Clinton is the weakest Dem candidate in 50 years https://t.co/pLk7rvoRSn https:/…
RT @HeidiAlpine: @dmartosko Cruz video found.....racing from the scene.... #cruzsexscandal https://t.co/zuAPZfQDk3





# Define count_entries()
def count_entries(df, col_name='lang'):
    """Return a dictionary with counts of
    occurrences as value for each key."""

    # Initialize an empty dictionary: cols_count
    cols_count = {}

    # Add try block
    try:
        # Extract column from DataFrame: col
        col = df[col_name]
        
        # Iterate over the column in dataframe
        for entry in col:
    
            # If entry is in cols_count, add 1
            if entry in cols_count.keys():
                cols_count[entry] += 1
            # Else add the entry to cols_count, set the value to 1
            else:
                cols_count[entry] = 1
    
        # Return the cols_count dictionary
        return cols_count

    # Add except block
    except:
        print('The DataFrame does not have a ' + col_name + ' column.')

# Call count_entries(): result1
result1 = count_entries(tweets_df, 'lang')

# Print result1
print(result1)

{'et': 1, 'en': 97, 'und': 2}






# Define count_entries()
def count_entries(df, col_name='lang'):
    """Return a dictionary with counts of
    occurrences as value for each key."""
    
    # Raise a ValueError if col_name is NOT in DataFrame
    if col_name not in df.columns:
        raise ValueError('The DataFrame does not have a ' + col_name + ' column.')

    # Initialize an empty dictionary: cols_count
    cols_count = {}
    
    # Extract column from DataFrame: col
    col = df[col_name]
    
    # Iterate over the column in DataFrame
    for entry in col:

        # If entry is in cols_count, add 1
        if entry in cols_count.keys():
            cols_count[entry] += 1
        # Else add the entry to cols_count, set the value to 1
        else:
            cols_count[entry] = 1

    # Return the cols_count dictionary
    return cols_count

# Call count_entries(): result1
result1 = count_entries(tweets_df, 'lang')

# Print result1
print(result1)

{'et': 1, 'en': 97, 'und': 2}

Extracting information for large amounts of Twitter data

# Define count_entries()
def count_entries(csv_file, c_size, colname):
    """Return a dictionary with counts of
    occurrences as value for each key."""
    
    # Initialize an empty dictionary: counts_dict
    counts_dict = {}

    # Iterate over the file chunk by chunk
    for chunk in pd.read_csv(csv_file,chunksize=c_size):

        # Iterate over the column in DataFrame
        for entry in chunk[colname]:
            if entry in counts_dict.keys():
                counts_dict[entry] += 1
            else:
                counts_dict[entry] = 1

    # Return counts_dict
    return counts_dict

# Call count_entries(): result_counts
result_counts = count_entries("tweets.csv", 10, "lang")

# Print result_counts
print(result_counts)




# Create a list of tuples: mutant_data
mutant_data = list(zip(mutants, aliases, powers))

# Print the list of tuples
print(mutant_data)

# Create a zip object using the three lists: mutant_zip
mutant_zip = zip(mutants, aliases, powers)

# Print the zip object
print(mutant_zip)

# Unpack the zip object and print the tuple values
for value1, value2, value3 in list(mutant_zip):
    print(value1, value2, value3)
    
    
    

# Create a zip object from mutants and powers: z1
z1 = zip(mutants, powers)

# Print the tuples in z1 by unpacking with *
print(*z1)

# Re-create a zip object from mutants and powers: z1
z1 = zip(mutants, powers)

# 'Unzip' the tuples in z1 by unpacking with * and zip(): result1, result2
result1, result2 = zip(*z1)

# Check if unpacked tuples are equivalent to original tuples
print(result1 == mutants)
print(result2 == powers)

Iterating over iterables (2)

# Create an iterator for range(3): small_value
small_value = iter(range(3))

# Print the values in small_value
print(next(small_value))
print(next(small_value))
print(next(small_value))

# Loop over range(3) and print the values
for num in range(3):
    print(num)


# Create an iterator for range(10 ** 100): googol
googol = iter(range(10 **100))

# Print the first 5 values from googol
print(next(googol))
print(next(googol))
print(next(googol))
print(next(googol))
print(next(googol))

Using conditionals in comprehensions (1)

# Create a 5 x 5 matrix using a list of lists: matrix
matrix = [[col for col in range(5)] for row in range(5)]

# Print the matrix
for row in matrix:
    print(row)
[0, 1, 2, 3, 4]
[0, 1, 2, 3, 4]
[0, 1, 2, 3, 4]
[0, 1, 2, 3, 4]
[0, 1, 2, 3, 4]





# Create a list of strings: fellowship
fellowship = ['frodo', 'samwise', 'merry', 'aragorn', 'legolas', 'boromir', 'gimli']

# Create list comprehension: new_fellowship
new_fellowship = [member for member in fellowship if len(member) >= 7]

# Print the new list
print(new_fellowship)
['samwise', 'aragorn', 'legolas', 'boromir']

# Create a list of strings: fellowship
fellowship = ['frodo', 'samwise', 'merry', 'aragorn', 'legolas', 'boromir', 'gimli']

# Create list comprehension: new_fellowship
new_fellowship = [member if len(member) >= 7 else "" for member in fellowship]

# Print the new list
print(new_fellowship)
['', 'samwise', '', 'aragorn', 'legolas', 'boromir', '']



In [1]: fellowship = ['frodo', 'samwise', 'merry', 'aragorn', 'legolas', 'boromir', 'gimli']
        
        # Create dict comprehension: new_fellowship
        new_fellowship = {member: len(member) for member in fellowship}
        
        # Print the new list
        print(new_fellowship)
{'samwise': 7, 'boromir': 7, 'aragorn': 7, 'merry': 5, 'frodo': 5, 'gimli': 5, 'legolas': 7}

Generator

Recall that generator expressions have essentially the same syntax as list comprehensions, except that they use parentheses () instead of brackets []; this should make things feel familiar! Furthermore, if you have ever iterated over a dictionary with .items(), or used the range() function, you have already encountered and used generators without knowing it: when you use these functions, Python creates generators for you behind the scenes.

In [4]: # Create generator object: result
        result = (num for num in range(0,10))
        
        # Print the first 5 values
        print(next(result))
        print(next(result))
        print(next(result))
        print(next(result))
        print(next(result))
        
        
        # Print the rest of the values
        for value in result:
            print(value)
0
1
2
3
4
5
6
7
8
9

generator function

In [1]: # Create a list of strings
        lannister = ['cersei', 'jaime', 'tywin', 'tyrion', 'joffrey']
        
        # Define generator function get_lengths
        def get_lengths(input_list):
            """Generator function that yields the
            length of the strings in input_list."""
        
            # Yield the length of a string
            for person in input_list:
                yield(len(person))
        
        # Print the values generated by get_lengths()
        for value in get_lengths(lannister):
            print(value)
6
5
5
6
7



In [1]: # Define lists2dict()
        def lists2dict(list1, list2):
            """Return a dictionary where list1 provides
            the keys and list2 provides the values."""
        
            # Zip lists: zipped_lists
            zipped_lists = zip(list1, list2)
        
            # Create a dictionary: rs_dict
            rs_dict = dict(zipped_lists)
        
            # Return the dictionary
            return rs_dict
        
        # Call lists2dict: rs_fxn
        rs_fxn = lists2dict(feature_names, row_vals)
        
        # Print rs_fxn
        print(rs_fxn)
{'Value': '133.56090740552298', 'IndicatorCode': 'SP.ADO.TFRT', 'CountryName': 'Arab World', 'CountryCode': 'ARB', 'Year': '1960', 'IndicatorName': 'Adolescent fertility rate (births per 1,000 women ages 15-19)'}




In [1]: # Print the first two lists in row_lists
        print(row_lists[0])
        print(row_lists[1])
        
        # Turn list of lists into list of dicts: list_of_dicts
        list_of_dicts = [lists2dict(feature_names, sublist) for sublist in row_lists]
        
        # Print the first two dictionaries in list_of_dicts
        print(list_of_dicts[0])
        print(list_of_dicts[1])
['Arab World', 'ARB', 'Adolescent fertility rate (births per 1,000 women ages 15-19)', 'SP.ADO.TFRT', '1960', '133.56090740552298']
['Arab World', 'ARB', 'Age dependency ratio (% of working-age population)', 'SP.POP.DPND', '1960', '87.7976011532547']
{'Value': '133.56090740552298', 'IndicatorCode': 'SP.ADO.TFRT', 'CountryName': 'Arab World', 'CountryCode': 'ARB', 'Year': '1960', 'IndicatorName': 'Adolescent fertility rate (births per 1,000 women ages 15-19)'}
{'Value': '87.7976011532547', 'IndicatorCode': 'SP.POP.DPND', 'CountryName': 'Arab World', 'CountryCode': 'ARB', 'Year': '1960', 'IndicatorName': 'Age dependency ratio (% of working-age population)'}



In [1]: # Import the pandas package
        import pandas as pd
        
        # Turn list of lists into list of dicts: list_of_dicts
        list_of_dicts = [lists2dict(feature_names, sublist) for sublist in row_lists]
        
        # Turn list of dicts into a DataFrame: df
        df = pd.DataFrame(list_of_dicts)
        
        # Print the head of the DataFrame
        print(df.head())
  CountryCode CountryName   IndicatorCode  \
0         ARB  Arab World     SP.ADO.TFRT   
1         ARB  Arab World     SP.POP.DPND   
2         ARB  Arab World  SP.POP.DPND.OL   
3         ARB  Arab World  SP.POP.DPND.YG   
4         ARB  Arab World  MS.MIL.XPRT.KD   

                                       IndicatorName               Value  Year  
0  Adolescent fertility rate (births per 1,000 wo...  133.56090740552298  1960  
1  Age dependency ratio (% of working-age populat...    87.7976011532547  1960  
2  Age dependency ratio, old (% of working-age po...   6.634579191565161  1960  
3  Age dependency ratio, young (% of working-age ...   81.02332950839141  1960  
4        Arms exports (SIPRI trend indicator values)           3000000.0  1960

Writing a generator to load data in chunks

In [1]: # Open a connection to the file
        with open('world_dev_ind.csv') as file:
        
            # Skip the column names
            file.readline()
        
            # Initialize an empty dictionary: counts_dict
            counts_dict = {}
        
            # Process only the first 1000 rows
            for j in range(0, 1000):
        
                # Split the current line into a list: line
                line = file.readline().split(',')
        
                # Get the value for the first column: first_col
                first_col = line[0]
        
                # If the column value is in the dict, increment its value
                if first_col in counts_dict.keys():
                    counts_dict[first_col] += 1
        
                # Else, add to the dict and set value to 1
                else:
                    counts_dict[first_col] = 1
        
        # Print the resulting dictionary
        print(counts_dict)
{'Arab World': 80, 'Heavily indebted poor countries (HIPC)': 18, 'European Union': 116, 'East Asia & Pacific (developing only)': 123, 'Fragile and conflict affected situations': 76, 'Euro area': 119, 'Central Europe and the Baltics': 71, 'Europe & Central Asia (developing only)': 89, 'East Asia & Pacific (all income levels)': 122, 'Europe & Central Asia (all income levels)': 109, 'Caribbean small states': 77}



In [1]: # Define read_large_file()
        def read_large_file(file_object):
            """A generator function to read a large file lazily."""
        
            # Loop indefinitely until the end of the file
            while True:
        
                # Read a line from the file: data
                data = file_object.readline()
        
                # Break if this is the end of the file
                if not data:
                    break
        
                # Yield the line of data
                yield data
                
        # Open a connection to the file
        with open('world_dev_ind.csv') as file:
        
            # Create a generator object for the file: gen_file
            gen_file = read_large_file(file)
        
            # Print the first three lines of the file
            print(next(gen_file))
            print(next(gen_file))
            print(next(gen_file))
CountryName,CountryCode,IndicatorName,IndicatorCode,Year,Value

Arab World,ARB,"Adolescent fertility rate (births per 1,000 women ages 15-19)",SP.ADO.TFRT,1960,133.56090740552298

Arab World,ARB,Age dependency ratio (% of working-age population),SP.POP.DPND,1960,87.7976011532547


In [1]: # Initialize an empty dictionary: counts_dict
        counts_dict = {}
        
        # Open a connection to the file
        with open("world_dev_ind.csv") as file:
        
            # Iterate over the generator from read_large_file()
            for line in read_large_file(file):
        
                row = line.split(',')
                first_col = row[0]
        
                if first_col in counts_dict.keys():
                    counts_dict[first_col] += 1
                else:
                    counts_dict[first_col] = 1
        
        # Print            
        print(counts_dict)
{'North America': 123, 'Least developed countries: UN classification': 78, 'CountryName': 1, 'Euro area': 119, 'East Asia & Pacific (all income levels)': 122, 'High income': 131, 'Middle income': 138, 'Arab World': 80, 'Low income': 80, 'High income: nonOECD': 68, 'Central Europe and the Baltics': 71, 'Other small states': 63, 'Europe & Central Asia (all income levels)': 109, 'European Union': 116, 'Small states': 69, 'Fragile and conflict affected situations': 76, 'Lower middle income': 126, 'Heavily indebted poor countries (HIPC)': 99, 'Latin America & Caribbean (all income levels)': 130, 'Low & middle income': 138, 'High income: OECD': 127, 'Middle East & North Africa (developing only)': 94, 'South Asia': 36, 'East Asia & Pacific (developing only)': 123, 'Latin America & Caribbean (developing only)': 133, 'Pacific island small states': 66, 'Middle East & North Africa (all income levels)': 89, 'Europe & Central Asia (developing only)': 89, 'OECD members': 130, 'Caribbean small states': 77}

Writing an iterator to load data in chunks (5)

# Define plot_pop()
def plot_pop(filename, country_code):

    # Initialize reader object: urb_pop_reader
    urb_pop_reader = pd.read_csv(filename, chunksize=1000)

    # Initialize empty DataFrame: data
    data = pd.DataFrame()
    
    # Iterate over each DataFrame chunk
    for df_urb_pop in urb_pop_reader:
        # Check out specific country: df_pop_ceb
        df_pop_ceb = df_urb_pop[df_urb_pop['CountryCode'] == country_code]

        # Zip DataFrame columns of interest: pops
        pops = zip(df_pop_ceb['Total Population'],
                    df_pop_ceb['Urban population (% of total)'])

        # Turn zip object into list: pops_list
        pops_list = list(pops)

        # Use list comprehension to create new DataFrame column 'Total Urban Population'
        df_pop_ceb['Total Urban Population'] = [int(tup[0] * tup[1] * 0.01) for tup in pops_list]
    
        # Append DataFrame chunk to data: data
        data = data.append(df_pop_ceb)

    # Plot urban population data
    data.plot(kind='scatter', x='Year', y='Total Urban Population')
    plt.show()

# Set the filename: fn
fn = 'ind_pop_data.csv'

# Call plot_pop for country code 'CEB'
plot_pop(fn, "CEB")

# Call plot_pop for country code 'ARB'
plot_pop(fn, "ARB")

IMPORTING DATA IN PYTHON

# Import pandas and matplotlib.pyplot
import pandas as pd
import matplotlib.pyplot as plt

# Assign filename: file
file = 'titanic_corrupt.txt'

# Import file: data
data = pd.read_csv(file, sep="\t", comment='#', na_values='Nothing')

# Print the head of the DataFrame
print(data.head())

# Plot 'Age' variable in a histogram
pd.DataFrame.hist(data[['Age']])
plt.xlabel('Age (years)')
plt.ylabel('count')
plt.show()




# Import packages
from sqlalchemy import create_engine
import pandas as pd

# Create engine: engine
engine = create_engine('sqlite:///Chinook.sqlite')

# Open engine connection: con
con = engine.connect()

# Perform query: rs
rs = con.execute("select * from album")

# Save results of the query to DataFrame: df
df = pd.DataFrame(rs.fetchall())

# Close connection
con.close()





import pandas as pd
from sqlalchemy import create_engine

# Create engine: engine
engine = create_engine('sqlite:///Chinook.sqlite')

# Open engine in context manager
# Perform query and save results to DataFrame: df
with engine.connect() as con:
    rs = con.execute("select LastName, title from employee")
    df = pd.DataFrame(rs.fetchmany(size=3))
    df.columns = rs.keys()

# Print the length of the DataFrame df
print(len(df))

# Print the head of the DataFrame df
print(df.head())

Pandas and The Hello World of SQL Queries!

# Import packages
from sqlalchemy import create_engine
import pandas as pd

# Create engine: engine
engine = create_engine("sqlite:///Chinook.sqlite")

# Execute query and store records in DataFrame: df
df = pd.read_sql_query("select * from album", engine)

# Print head of DataFrame
print(df.head())

# Open engine in context manager and store query result in df1
with engine.connect() as con:
    rs = con.execute("SELECT * FROM Album")
    df1 = pd.DataFrame(rs.fetchall())
    df1.columns = rs.keys()

# Confirm that both methods yield the same result
print(df.equals(df1))

Opening and reading flat files from the web

# Import packages
import matplotlib.pyplot as plt
import pandas as pd

# Assign url of file: url
url = "https://s3.amazonaws.com/assets.datacamp.com/production/course_1606/datasets/winequality-red.csv"

# Read file into a DataFrame: df
df = pd.read_csv(url, sep=";")

# Print the head of the DataFrame
print(df.head())

# Plot first column of df
pd.DataFrame.hist(df.iloc[:, 0:1])
plt.xlabel('fixed acidity (g(tartaric acid)/dm$^3$)')
plt.ylabel('count')
plt.show()


# Import package
import pandas as pd

# Assign url of file: url
url = "http://s3.amazonaws.com/assets.datacamp.com/course/importing_data_into_r/latitude.xls"

# Read in all sheets of Excel file: xl
xl = pd.read_excel(url, sheet_name=None)

# Print the sheet names to the shell
print(xl.keys())

# Print the head of the first sheet (using its name, NOT its index)
print(xl['1700'].head())

Turning a webpage into data using BeautifulSoup: getting the hyperlinks

# Import packages
import requests
from bs4 import BeautifulSoup

# Specify url: url
url = 'https://www.python.org/~guido/'

# Package the request, send the request and catch the response: r
r = requests.get(url)

# Extract the response as html: html_doc
html_doc = r.text

# Create a BeautifulSoup object from the HTML: soup
soup = BeautifulSoup(html_doc, 'html.parser')

# Get the title of Guido's webpage: guido_title
guido_title = soup.title

# Print the title of Guido's webpage to the shell
print(guido_title)

# Get Guido's text: guido_text
guido_text = soup.get_text()

# Print Guido's text to the shell
print(guido_text)
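
The heading above also mentions getting the hyperlinks, which this snippet does not show; a minimal sketch of that step, reusing the same soup object (find_all() and .get('href') are standard BeautifulSoup calls):

# Find all 'a' tags, which define hyperlinks: a_tags
a_tags = soup.find_all('a')

# Print the URLs to the shell
for link in a_tags:
    print(link.get('href'))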

tweepy

# Import package
import tweepy

# Store OAuth authentication credentials in relevant variables
access_token = "1092294848-aHN7DcRP9B4VMTQIhwqOYiB14YkW92fFO8k8EPy"
access_token_secret = "X4dHmhPfaksHcQ7SCbmZa2oYBBVSD2g8uIHXsp5CTaksx"
consumer_key = "nZ6EA0FxZ293SxGNg8g8aP0HM"
consumer_secret = "fJGEodwe3KiKUnsYJC3VRndj7jevVvXbK2D5EiJ2nehafRgA6i"

# Pass OAuth details to tweepy's OAuth handler
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)





# Initialize Stream listener
l = MyStreamListener()

# Create your Stream object with authentication
stream = tweepy.Stream(auth, l)


# Filter Twitter Streams to capture data by the keywords:
stream.filter(track=['clinton', 'trump','sanders','cruz'])
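
The MyStreamListener class used above is not defined in these notes; a minimal sketch, assuming the pre-4.0 tweepy API (tweepy.StreamListener) that these snippets appear to target, could look like this:

import json

class MyStreamListener(tweepy.StreamListener):
    def __init__(self, api=None):
        super().__init__(api)
        self.num_tweets = 0
        self.file = open("tweets.txt", "w")

    def on_status(self, status):
        # Append each incoming tweet's JSON to the file, stop after 100 tweets
        self.file.write(json.dumps(status._json) + '\n')
        self.num_tweets += 1
        if self.num_tweets < 100:
            return True
        self.file.close()
        return False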




# Import package
import json

# String of path to file: tweets_data_path
tweets_data_path = "tweets.txt"

# Initialize empty list to store tweets: tweets_data
tweets_data = []

# Open connection to file
tweets_file = open(tweets_data_path, "r")

# Read in tweets and store in list: tweets_data
for line in tweets_file:
    tweet = json.loads(line)
    tweets_data.append(tweet)

# Close connection to file
tweets_file.close()

# Print the keys of the first tweet dict
print(tweets_data[0].keys())






# Import package
import pandas as pd

# Build DataFrame of tweet texts and languages
df = pd.DataFrame(tweets_data, columns=['text','lang'])

# Print head of DataFrame
print(df.head())



# Initialize the tweet counters
[clinton, trump, sanders, cruz] = [0, 0, 0, 0]

# Iterate through df, counting the number of tweets in which
# each candidate is mentioned
for index, row in df.iterrows():
    clinton += word_in_text('clinton', row['text'])
    trump += word_in_text('trump', row['text'])
    sanders += word_in_text('sanders', row['text'])
    cruz += word_in_text('cruz', row['text'])
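
The word_in_text() helper is assumed to be available here; a minimal sketch of such a helper, using a case-insensitive regular-expression search (the exact implementation used in the course may differ), might be:

import re

def word_in_text(word, text):
    """Return 1 if word occurs in text (case-insensitive), else 0."""
    match = re.search(word.lower(), text.lower())
    return 1 if match else 0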





import seaborn as sns
import matplotlib.pyplot as plt

# Set seaborn style
sns.set(color_codes=True)

# Create a list of labels: cd
cd = ['clinton', 'trump', 'sanders', 'cruz']

# Plot the bar chart of mention counts
ax = sns.barplot(cd, [clinton, trump, sanders, cruz])
ax.set(ylabel="count")
plt.show()

LAMBDA AND MAP FILTER REDUCE

In [1]: # Define echo_word as a lambda function: echo_word
        echo_word = (lambda word1, echo: word1 * echo)
        
        # Call echo_word: result
        result = echo_word("hey", 5)
        
        # Print result
        print(result)
heyheyheyheyhey



# map() applies a function over an object, such as a list
In [1]: # Create a list of strings: spells
        spells = ["protego", "accio", "expecto patronum", "legilimens"]
        
        # Use map() to apply a lambda function over spells: shout_spells
        shout_spells = map(lambda item: item + "!!!", spells)
        
        # Convert shout_spells to a list: shout_spells_list
        shout_spells_list = list(shout_spells)
        
        # Convert shout_spells into a list and print it
        print(shout_spells_list)
['protego!!!', 'accio!!!', 'expecto patronum!!!', 'legilimens!!!']


#  The function filter() offers a way to filter out elements from a list that don't satisfy certain criteria.
In [1]: # Create a list of strings: fellowship
        fellowship = ['frodo', 'samwise', 'merry', 'pippin', 'aragorn', 'boromir', 'legolas', 'gimli', 'gandalf']
        
        # Use filter() to apply a lambda function over fellowship: result
result = filter(lambda member: len(member) > 6, fellowship)
        
        # Convert result to a list: result_list
        result_list = list(result)
        
        # Convert result into a list and print it
        print(result_list)
['samwise', 'aragorn', 'boromir', 'legolas', 'gandalf']


#  The reduce() function is useful for performing some computation on a list and, unlike map() and filter(), returns a single value as a result. To use reduce(), you must import it from the functools module.
In [1]: # Import reduce from functools
        from functools import reduce
        
        # Create a list of strings: stark
        stark = ['robb', 'sansa', 'arya', 'brandon', 'rickon']
        
        # Use reduce() to apply a lambda function over stark: result
        result = reduce(lambda item1, item2: item1+item2, stark)
        
        # Print the result
        print(result)
robbsansaaryabrandonrickon

PYSPARK WORDCOUNT

from __future__ import print_function

import sys
from operator import add

from pyspark.sql import SparkSession


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: wordcount <file>", file=sys.stderr)
        sys.exit(-1)

    spark = SparkSession\
        .builder\
        .appName("PythonWordCount")\
        .getOrCreate()

    lines = spark.read.text(sys.argv[1]).rdd.map(lambda r: r[0])
    counts = lines.flatMap(lambda x: x.split(' ')) \
                  .map(lambda x: (x, 1)) \
                  .reduceByKey(add)
    output = counts.collect()
    for (word, count) in output:
        print("%s: %i" % (word, count))

    spark.stop()

CLEANING DATA IN PYTHON

pd.set_option('display.expand_frame_repr', False)

Describe DataFrame

In [2]: pd.set_option('display.expand_frame_repr', False)

In [3]: airquality.head()
Out[3]: 
   Ozone  Solar.R  Wind  Temp  Month  Day
0   41.0    190.0   7.4    67      5    1
1   36.0    118.0   8.0    72      5    2
2   12.0    149.0  12.6    74      5    3
3   18.0    313.0  11.5    62      5    4
4    NaN      NaN  14.3    56      5    5

In [4]: airquality.tail()
Out[4]: 
     Ozone  Solar.R  Wind  Temp  Month  Day
148   30.0    193.0   6.9    70      9   26
149    NaN    145.0  13.2    77      9   27
150   14.0    191.0  14.3    75      9   28
151   18.0    131.0   8.0    76      9   29
152   20.0    223.0  11.5    68      9   30

In [5]: airquality.describe()
Out[5]: 
            Ozone     Solar.R        Wind        Temp       Month         Day
count  116.000000  146.000000  153.000000  153.000000  153.000000  153.000000
mean    42.129310  185.931507    9.957516   77.882353    6.993464   15.803922
std     32.987885   90.058422    3.523001    9.465270    1.416522    8.864520
min      1.000000    7.000000    1.700000   56.000000    5.000000    1.000000
25%     18.000000  115.750000    7.400000   72.000000    6.000000    8.000000
50%     31.500000  205.000000    9.700000   79.000000    7.000000   16.000000
75%     63.250000  258.750000   11.500000   85.000000    8.000000   23.000000
max    168.000000  334.000000   20.700000   97.000000    9.000000   31.000000

In [6]: airquality.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 153 entries, 0 to 152
Data columns (total 6 columns):
Ozone      116 non-null float64
Solar.R    146 non-null float64
Wind       153 non-null float64
Temp       153 non-null int64
Month      153 non-null int64
Day        153 non-null int64
dtypes: float64(3), int64(3)
memory usage: 7.2 KB

In [8]: airquality.shape
Out[8]: (153, 6)

Pandas melt and pivot_table

In [4]: # Print the head of airquality
        print(airquality.head())
        
        # Melt airquality: airquality_melt
        airquality_melt = pd.melt(airquality, id_vars=['Month', 'Day'], var_name="measurement", value_name="reading")
        
        # Print the head of airquality_melt
        print(airquality_melt.head())
   Ozone  Solar.R  Wind  Temp  Month  Day
0   41.0    190.0   7.4    67      5    1
1   36.0    118.0   8.0    72      5    2
2   12.0    149.0  12.6    74      5    3
3   18.0    313.0  11.5    62      5    4
4    NaN      NaN  14.3    56      5    5
   Month  Day measurement  reading
0      5    1       Ozone     41.0
1      5    2       Ozone     36.0
2      5    3       Ozone     12.0
3      5    4       Ozone     18.0
4      5    5       Ozone      NaN




In [13]: # Print the head of airquality_melt
         print(airquality_melt.head())
         
         # Pivot airquality_melt: airquality_pivot
         airquality_pivot = pd.pivot_table(airquality_melt, index=["Month", "Day"], columns="measurement", values="reading")
         
         print("-------------")
         # Print the head of airquality_pivot
         print(airquality_pivot.head())
   Month  Day measurement  reading
0      5    1       Ozone     41.0
1      5    2       Ozone     36.0
2      5    3       Ozone     12.0
3      5    4       Ozone     18.0
4      5    5       Ozone      NaN
-------------
measurement  Ozone  Solar.R  Temp  Wind
Month Day                              
5     1       41.0    190.0  67.0   7.4
      2       36.0    118.0  72.0   8.0
      3       12.0    149.0  74.0  12.6
      4       18.0    313.0  62.0  11.5
      5        NaN      NaN  56.0  14.3

In [14]: 


In [4]: # Print the index of airquality_pivot
        print(airquality_pivot.index)
        print("-------------")
        # Reset the index of airquality_pivot: airquality_pivot_reset
        airquality_pivot_reset = airquality_pivot.reset_index()
        
        # Print the new index of airquality_pivot_reset
        print(airquality_pivot_reset.index)
        print("-------------")
        # Print the head of airquality_pivot_reset
        print(airquality_pivot_reset.head())
MultiIndex(levels=[[5, 6, 7, 8, 9], [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]],
           labels=[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]],
           names=['Month', 'Day'])
-------------
RangeIndex(start=0, stop=153, step=1)
-------------
measurement  Month  Day  Ozone  Solar.R  Temp  Wind
0                5    1   41.0    190.0  67.0   7.4
1                5    2   36.0    118.0  72.0   8.0
2                5    3   12.0    149.0  74.0  12.6
3                5    4   18.0    313.0  62.0  11.5
4                5    5    NaN      NaN  56.0  14.3





In [2]: tb.head()
Out[2]: 
  country  year  m014  m1524  m2534  m3544  m4554  m5564   m65  mu  f014  f1524  f2534  f3544  f4554  f5564   f65  fu
0      AD  2000   0.0    0.0    1.0    0.0    0.0    0.0   0.0 NaN   NaN    NaN    NaN    NaN    NaN    NaN   NaN NaN
1      AE  2000   2.0    4.0    4.0    6.0    5.0   12.0  10.0 NaN   3.0   16.0    1.0    3.0    0.0    0.0   4.0 NaN
2      AF  2000  52.0  228.0  183.0  149.0  129.0   94.0  80.0 NaN  93.0  414.0  565.0  339.0  205.0   99.0  36.0 NaN
3      AG  2000   0.0    0.0    0.0    0.0    0.0    0.0   1.0 NaN   1.0    1.0    1.0    0.0    0.0    0.0   0.0 NaN
4      AL  2000   2.0   19.0   21.0   14.0   24.0   19.0  16.0 NaN   3.0   11.0   10.0    8.0    8.0    5.0  11.0 NaN

In [3]: # Melt tb: tb_melt
        tb_melt = pd.melt(tb, id_vars=["country","year"])
        
        # Create the 'gender' column
        tb_melt['gender'] = tb_melt.variable.str[0]
        
        # Create the 'age_group' column
        tb_melt['age_group'] = tb_melt.variable.str[1:]
        
        # Print the head of tb_melt
        print(tb_melt.head())
  country  year variable  value gender age_group
0      AD  2000     m014    0.0      m       014
1      AE  2000     m014    2.0      m       014
2      AF  2000     m014   52.0      m       014
3      AG  2000     m014    0.0      m       014
4      AL  2000     m014    2.0      m       014



In [2]: ebola.head()
Out[2]: 
         Date  Day  Cases_Guinea  Cases_Liberia  Cases_SierraLeone  Cases_Nigeria  Cases_Senegal  Cases_UnitedStates  Cases_Spain  Cases_Mali  Deaths_Guinea  Deaths_Liberia  Deaths_SierraLeone  Deaths_Nigeria  Deaths_Senegal  Deaths_UnitedStates  Deaths_Spain  Deaths_Mali
0    1/5/2015  289        2776.0            NaN            10030.0            NaN            NaN                 NaN          NaN         NaN         1786.0             NaN              2977.0             NaN             NaN                  NaN           NaN          NaN
1    1/4/2015  288        2775.0            NaN             9780.0            NaN            NaN                 NaN          NaN         NaN         1781.0             NaN              2943.0             NaN             NaN                  NaN           NaN          NaN
2    1/3/2015  287        2769.0         8166.0             9722.0            NaN            NaN                 NaN          NaN         NaN         1767.0          3496.0              2915.0             NaN             NaN                  NaN           NaN          NaN
3    1/2/2015  286           NaN         8157.0                NaN            NaN            NaN                 NaN          NaN         NaN            NaN          3496.0                 NaN             NaN             NaN                  NaN           NaN          NaN
4  12/31/2014  284        2730.0         8115.0             9633.0            NaN            NaN                 NaN          NaN         NaN         1739.0          3471.0              2827.0             NaN             NaN                  NaN           NaN          NaN


In [8]: # Melt ebola: ebola_melt
        ebola_melt = pd.melt(ebola, id_vars=["Date", "Day"], var_name="type_country", value_name="counts")
        
        # Create the 'str_split' column
        ebola_melt['str_split'] = ebola_melt['type_country'].str.split("_")
        
        # Create the 'type' column
        ebola_melt['type'] = ebola_melt['str_split'].str.get(0)
        
        # Create the 'country' column
        ebola_melt['country'] = ebola_melt['str_split'].str.get(1)
        
        # Print the head of ebola_melt
        print(ebola_melt.head())
         Date  Day  type_country  counts        str_split   type country
0    1/5/2015  289  Cases_Guinea  2776.0  [Cases, Guinea]  Cases  Guinea
1    1/4/2015  288  Cases_Guinea  2775.0  [Cases, Guinea]  Cases  Guinea
2    1/3/2015  287  Cases_Guinea  2769.0  [Cases, Guinea]  Cases  Guinea
3    1/2/2015  286  Cases_Guinea     NaN  [Cases, Guinea]  Cases  Guinea
4  12/31/2014  284  Cases_Guinea  2730.0  [Cases, Guinea]  Cases  Guinea

globbing

# Import necessary modules
import pandas as pd
import glob

# Write the pattern: pattern
pattern = '*.csv'

# Save all file matches: csv_files
csv_files = glob.glob(pattern)

# Print the file names
print(csv_files)

# Load the second file into a DataFrame: csv2
csv2 = pd.read_csv(csv_files[1])

# Print the head of csv2
print(csv2.head())


# Create an empty list to hold the DataFrames: frames
frames = []

# Iterate over csv_files
for csv in csv_files:

    # Read csv into a DataFrame: df
    df = pd.read_csv(csv)

    # Append df to frames
    frames.append(df)

# Concatenate frames into a single DataFrame: uber
uber = pd.concat(frames)

# Print the shape of uber
print(uber.shape)

# Print the head of uber
print(uber.head())

Merging

In [7]: site
Out[7]: 
    name    lat    long
0   DR-1 -49.85 -128.57
1   DR-3 -47.15 -126.72
2  MSK-4 -48.87 -123.40

In [8]: visited
Out[8]: 
   ident   site       dated
0    619   DR-1  1927-02-08
1    622   DR-1  1927-02-10
2    734   DR-3  1939-01-07
3    735   DR-3  1930-01-12
4    751   DR-3  1930-02-26
5    752   DR-3         NaN
6    837  MSK-4  1932-01-14
7    844   DR-1  1932-03-22

In [9]: survey
Out[9]: 
    taken person quant  reading
0     619   dyer   rad     9.82
1     619   dyer   sal     0.13
2     622   dyer   rad     7.80
3     622   dyer   sal     0.09
4     734     pb   rad     8.41
5     734   lake   sal     0.05
6     734     pb  temp   -21.50
7     735     pb   rad     7.22
8     735    NaN   sal     0.06
9     735    NaN  temp   -26.00
10    751     pb   rad     4.35
11    751     pb  temp   -18.50
12    751   lake   sal     0.10
13    752   lake   rad     2.19
14    752   lake   sal     0.09
15    752   lake  temp   -16.00
16    752    roe   sal    41.60
17    837   lake   rad     1.46
18    837   lake   sal     0.21
19    837    roe   sal    22.50
20    844    roe   rad    11.25

In [6]: # Merge site and visited: m2m
        m2m = pd.merge(left=site, right=visited, left_on='name', right_on='site')
        
        # Merge m2m and survey: m2m
        m2m = pd.merge(left=m2m, right=survey, left_on="ident", right_on="taken")
        
        # Print the first 20 lines of m2m
        print(m2m.head(20))
     name    lat    long  ident   site       dated  taken person quant  reading
0    DR-1 -49.85 -128.57    619   DR-1  1927-02-08    619   dyer   rad     9.82
1    DR-1 -49.85 -128.57    619   DR-1  1927-02-08    619   dyer   sal     0.13
2    DR-1 -49.85 -128.57    622   DR-1  1927-02-10    622   dyer   rad     7.80
3    DR-1 -49.85 -128.57    622   DR-1  1927-02-10    622   dyer   sal     0.09
4    DR-1 -49.85 -128.57    844   DR-1  1932-03-22    844    roe   rad    11.25
5    DR-3 -47.15 -126.72    734   DR-3  1939-01-07    734     pb   rad     8.41
6    DR-3 -47.15 -126.72    734   DR-3  1939-01-07    734   lake   sal     0.05
7    DR-3 -47.15 -126.72    734   DR-3  1939-01-07    734     pb  temp   -21.50
8    DR-3 -47.15 -126.72    735   DR-3  1930-01-12    735     pb   rad     7.22
9    DR-3 -47.15 -126.72    735   DR-3  1930-01-12    735    NaN   sal     0.06
10   DR-3 -47.15 -126.72    735   DR-3  1930-01-12    735    NaN  temp   -26.00
11   DR-3 -47.15 -126.72    751   DR-3  1930-02-26    751     pb   rad     4.35
12   DR-3 -47.15 -126.72    751   DR-3  1930-02-26    751     pb  temp   -18.50
13   DR-3 -47.15 -126.72    751   DR-3  1930-02-26    751   lake   sal     0.10
14   DR-3 -47.15 -126.72    752   DR-3         NaN    752   lake   rad     2.19
15   DR-3 -47.15 -126.72    752   DR-3         NaN    752   lake   sal     0.09
16   DR-3 -47.15 -126.72    752   DR-3         NaN    752   lake  temp   -16.00
17   DR-3 -47.15 -126.72    752   DR-3         NaN    752    roe   sal    41.60
18  MSK-4 -48.87 -123.40    837  MSK-4  1932-01-14    837   lake   rad     1.46
19  MSK-4 -48.87 -123.40    837  MSK-4  1932-01-14    837   lake   sal     0.21

apply

In [2]: # Define recode_gender()
        def recode_gender(gender):
        
            # Return 0 if gender is 'Female'
            if gender == 'Male':
                return 1
            
            # Return 1 if gender is 'Male'    
            elif gender == 'Female':
                return 0
            
            # Return np.nan    
            else:
                return np.nan
        
        # Apply the function to the sex column
        tips['recode'] = tips.sex.apply(recode_gender)
        
        # Print the first five rows of tips
        print(tips.head())
   total_bill   tip     sex smoker  day    time  size  recode
0       16.99  1.01  Female     No  Sun  Dinner   2.0     0.0
1       10.34  1.66    Male     No  Sun  Dinner   3.0     1.0
2       21.01  3.50    Male     No  Sun  Dinner   3.0     1.0
3       23.68  3.31    Male     No  Sun  Dinner   2.0     1.0
4       24.59  3.61  Female     No  Sun  Dinner   4.0     0.0

In [3]: tips.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 8 columns):
total_bill    202 non-null float64
tip           220 non-null float64
sex           234 non-null object
smoker        229 non-null object
day           243 non-null object
time          227 non-null object
size          231 non-null float64
recode        234 non-null float64
dtypes: float64(4), object(4)
memory usage: 15.3+ KB







In [6]: g1800s.head()
Out[6]: 
         Life expectancy  ...     1899
0               Abkhazia  ...      NaN
1            Afghanistan  ...    27.20
2  Akrotiri and Dhekelia  ...      NaN
3                Albania  ...    35.40
4                Algeria  ...    28.82

[5 rows x 101 columns]
In [3]: g1800s.describe()
Out[3]: 
             1800     ...            1899
count  201.000000     ...      201.000000
mean    31.486020     ...       32.962985
std      3.763585     ...        6.325265
min     23.390000     ...       18.900000
25%     29.000000     ...       29.000000
50%     31.800000     ...       32.000000
75%     33.900000     ...       35.180000
max     42.850000     ...       51.620000

[8 rows x 100 columns]

In [4]: g1800s.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260 entries, 0 to 259
Columns: 101 entries, Life expectancy to 1899
dtypes: float64(100), object(1)
memory usage: 205.2+ KB

In [5]: g1800s.shape
Out[5]: (260, 101)

In [2]: def check_null_or_valid(row_data):
            """Function that takes a row of data,
            drops all missing values,
            and checks if all remaining values are greater than or equal to 0
            """
            no_na = row_data.dropna()
            numeric = pd.to_numeric(no_na)
            ge0 = numeric >= 0
            return ge0
        
        # Check whether the first column is 'Life expectancy'
        assert g1800s.columns[0] == 'Life expectancy'
        
        # Check whether the values in the row are valid
        assert g1800s.iloc[:, 1:].apply(check_null_or_valid, axis=1).all().all()
        
        # Check that there is only one instance of each country
        assert g1800s['Life expectancy'].value_counts()[0] == 1





In [1]: # Create the series of countries: countries
        countries = gapminder['country']
        
        # Drop all the duplicates from countries
        countries = countries.drop_duplicates()
        
        # Write the regular expression: pattern
pattern = r'^[A-Za-z\.\s]*$'
        
        # Create the Boolean vector: mask
        mask = countries.str.contains(pattern)
        
        # Invert the mask: mask_inverse
        mask_inverse = ~mask
        
        # Subset countries using mask_inverse: invalid_countries
        invalid_countries = countries.loc[mask_inverse]
        
        # Print invalid_countries
        print(invalid_countries)
49            Congo, Dem. Rep.
50                 Congo, Rep.
53               Cote d'Ivoire
73      Falkland Is (Malvinas)
93               Guinea-Bissau
98            Hong Kong, China
118    United Korea (former)\n
131               Macao, China
132             Macedonia, FYR
145      Micronesia, Fed. Sts.
161            Ngorno-Karabakh
187             St. Barthélemy
193     St.-Pierre-et-Miquelon
225                Timor-Leste
251      Virgin Islands (U.S.)
252       North Yemen (former)
253       South Yemen (former)
258                      Åland
Name: country, dtype: object


In [1]: gapminder.shape
Out[1]: (169260, 3)

In [2]: # Assert that country does not contain any missing values
        assert pd.notnull(gapminder.country).all()
        
        # Assert that year does not contain any missing values
        assert pd.notnull(gapminder.year).all()
        
        # Drop the missing values
        gapminder = gapminder.dropna()
        
        # Print the shape of gapminder
        print(gapminder.shape)
(43857, 3)

In [8]: # Add first subplot
        plt.subplot(2, 1, 1) 
        
        # Create a histogram of life_expectancy
        gapminder.life_expectancy.plot(kind='hist')
        
        # Group gapminder: gapminder_agg
        gapminder_agg = gapminder.groupby('year')['life_expectancy'].mean()
        
        # Print the head of gapminder_agg
        print(gapminder_agg.head())
        
        # Print the tail of gapminder_agg
        print(gapminder_agg.tail())
        
        # Add second subplot
        plt.subplot(2, 1, 2)
        
        # Create a line plot of life expectancy per year
        gapminder_agg.plot()
        
        # Add title and specify axis labels
        plt.title('Life expectancy over the years')
        plt.ylabel('Life expectancy')
        plt.xlabel('Year')
        
        # Display the plots
        plt.tight_layout()
        plt.show()
        
        # Save both DataFrames to csv files
        gapminder.to_csv("gapminder.csv")
        gapminder_agg.to_csv("gapminder_agg.csv")
year
1800    31.486020
1801    31.448905
1802    31.463483
1803    31.377413
1804    31.446318
Name: life_expectancy, dtype: float64
year
2012    71.663077
2013    71.916106
2014    72.088125
2015    72.321010
2016    72.556635
Name: life_expectancy, dtype: float64

<script.py> output:
    year
    1800    31.486020
    1801    31.448905
    1802    31.463483
    1803    31.377413
    1804    31.446318
    Name: life_expectancy, dtype: float64
    year
    2012    71.663077
    2013    71.916106
    2014    72.088125
    2015    72.321010
    2016    72.556635
    Name: life_expectancy, dtype: float64

PANDAS FOUNDATION

create a df from lists using zip()

In [24]: list_keys
Out[24]: ['Country', 'Total']

In [25]: list_values
Out[25]: [['United States', 'Soviet Union', 'United Kingdom'], [1118, 473, 273]]

# Zip the 2 lists together into one list of (key,value) tuples: zipped
zipped = list(zip(list_keys, list_values))

# Inspect the list using print()
print(zipped)

# Build a dictionary with the zipped list: data
data = dict(zipped)

# Build and inspect a DataFrame from the dictionary: df
df = pd.DataFrame(data)
print(df)

    [('Country', ['United States', 'Soviet Union', 'United Kingdom']), ('Total', [1118, 473, 273])]
              Country  Total
    0   United States   1118
    1    Soviet Union    473
    2  United Kingdom    273

pandas visualization

# Create a list of y-axis column names: y_columns
y_columns = ["AAPL", "IBM"]

# Generate a line plot
df.plot(x="Month", y=y_columns)

# Generate a scatter plot
df.plot(kind="scatter", x='hp', y='mpg', s=sizes)

'''
While pandas can plot multiple columns of data in a single figure, 
making plots that share the same x and y axes, there are cases where 
two columns cannot be plotted together because their units do not match. 
The .plot() method can generate subplots for each column being plotted. 
Here, each plot will be scaled independently.
'''
# Make a list of the column names to be plotted: cols
cols = ["weight", "mpg"]
# Generate the box plots
df[cols].plot(kind="box", subplots=True)


# This formats the plots such that they appear on separate rows
fig, axes = plt.subplots(nrows=2, ncols=1)

# Plot the PDF (normalized histogram)
df.fraction.plot(ax=axes[0], kind='hist', bins=30, density=True, range=(0, .3))
plt.show()

# Plot the CDF (cumulative normalized histogram)
df.fraction.plot(ax=axes[1], kind='hist', density=True, cumulative=True, bins=30, range=(0, .3))


# Add the title
plt.title('Monthly stock prices')

# Add the y-axis label
plt.ylabel('Price ($US)')

# Display the plot
plt.show()

pandas descriptive statistics

#TODO merge with previous content #TODO add value_count()

print(df.quantile([0.05, 0.95]))
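
Per the TODO above, a quick value_counts() sketch; the column name 'origin' is just a hypothetical categorical column for illustration:

# Count occurrences of each unique value, including missing values
print(df['origin'].value_counts(dropna=False))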

pandas timeseries

# Reindex without fill method: ts3
ts3 = ts2.reindex(ts1.index)

# Reindex with fill method, using forward fill: ts4
ts4 = ts2.reindex(ts1.index, method="ffill")

# Combine ts1 + ts2: sum12
sum12 = ts1 + ts2

# Combine ts1 + ts3: sum13
sum13 = ts1 + ts3

# Combine ts1 + ts4: sum14
sum14 = ts1 + ts4

In [2]: ts1.head()
Out[2]: 
2016-07-01    0
2016-07-02    1
2016-07-03    2
2016-07-04    3
2016-07-05    4
dtype: int64

In [3]: ts2.head()
Out[3]: 
2016-07-01    0
2016-07-04    1
2016-07-05    2
2016-07-06    3
2016-07-07    4
dtype: int64

In [4]: sum14.head()
Out[4]: 
2016-07-01    0
2016-07-02    1
2016-07-03    2
2016-07-04    4
2016-07-05    6
dtype: int64


# Downsample to 6 hour data and aggregate by mean: df1
df1 = df.Temperature.resample("6H").mean()

# Downsample to daily data and count the number of data points: df2
df2 = df.Temperature.resample("D").count()


# Extract data from 2010-Aug-01 to 2010-Aug-15: unsmoothed
unsmoothed = df['Temperature']["2010-Aug-01":"2010-Aug-15"]

# Apply a rolling mean with a 24 hour window: smoothed
smoothed = unsmoothed.rolling(window=24).mean()

# Create a new DataFrame with columns smoothed and unsmoothed: august
august = pd.DataFrame({'smoothed':smoothed, 'unsmoothed': unsmoothed})

# Plot both smoothed and unsmoothed data using august.plot().
august.plot()
plt.show()

# Reset the index of ts2 to ts1, and then use linear interpolation to fill in the NaNs: ts2_interp
ts2_interp = ts2.reindex(ts1.index).interpolate(method="linear")

# Build a Boolean mask to filter out all the 'LAX' departure flights: mask
mask = df['Destination Airport'] == "LAX"

# Use the mask to subset the data: la
la = df[mask]

# Combine two columns of data to create a datetime series: times_tz_none 
times_tz_none = pd.to_datetime( la['Date (MM/DD/YYYY)'] + " " + la['Wheels-off Time'] )

# Localize the time to US/Central: times_tz_central
times_tz_central = times_tz_none.dt.tz_localize("US/Central")

# Convert the datetimes from US/Central to US/Pacific
times_tz_pacific = times_tz_central.dt.tz_convert("US/Pacific")


TODO move to cleaning data
# Read in the data file with header=None: df
df = pd.read_csv(data_file, header=None)

# Remove the appropriate columns: df_dropped
df_dropped = df.drop(list_to_drop, axis="columns")

# Convert the date column to string: df_dropped['date']
df_dropped['date'] = df_dropped['date'].astype(str)

# Pad leading zeros to the Time column: df_dropped['Time']
df_dropped['Time'] = df_dropped['Time'].apply(lambda x:'{:0>4}'.format(x))

# Concatenate the new date and Time columns: date_string
date_string = df_dropped.date  + df_dropped.Time

# Convert the date_string Series to datetime: date_times
date_times = pd.to_datetime(date_string, format='%Y%m%d%H%M')

# Set the index to be the new date_times container: df_clean
df_clean = df_dropped.set_index(date_times)

# Convert the dry_bulb_faren column to numeric values: df_clean['dry_bulb_faren']
df_clean['dry_bulb_faren'] = pd.to_numeric(df_clean['dry_bulb_faren'], errors="coerce")

# Convert the wind_speed and dew_point_faren columns to numeric values
df_clean['wind_speed'] = pd.to_numeric(df_clean.wind_speed, errors="coerce")
df_clean['dew_point_faren'] = pd.to_numeric(df_clean.dew_point_faren, errors="coerce")


# From previous steps
is_sky_clear = df_clean['sky_condition']=='CLR'
sunny = df_clean.loc[is_sky_clear]
sunny_daily_max = sunny.resample('D').max()
is_sky_overcast = df_clean['sky_condition'].str.contains('OVC')
overcast = df_clean.loc[is_sky_overcast]
overcast_daily_max = overcast.resample('D').max()

# Calculate the mean of sunny_daily_max
sunny_daily_max_mean = sunny_daily_max.mean()

# Calculate the mean of overcast_daily_max
overcast_daily_max_mean = overcast_daily_max.mean()

# Print the difference (sunny minus overcast)
print(sunny_daily_max_mean - overcast_daily_max_mean)

PANDAS MANIPULATION

Slicing

In [3]: election.head()
Out[3]: 
          state   total      Obama     Romney  winner  voters
county                                                       
Adams        PA   41973  35.482334  63.112001  Romney   61156
Allegheny    PA  614671  56.640219  42.185820   Obama  924351
Armstrong    PA   28322  30.696985  67.901278  Romney   42147
Beaver       PA   80015  46.032619  52.637630  Romney  115157
Bedford      PA   21444  22.057452  76.986570  Romney   32189

# Slice the row labels 'Perry' to 'Potter': p_counties
p_counties = election.loc['Perry':'Potter', :]

# Slice the row labels 'Potter' to 'Perry' in reverse order: p_counties_rev
p_counties_rev = election.loc['Potter':'Perry':-1, :]


# Slice the columns from the starting column to 'Obama': left_columns
left_columns = election.loc[:, :'Obama']

# Slice the columns from 'Obama' to 'winner': middle_columns
middle_columns = election.loc[:, 'Obama':'winner']

# Slice the columns from 'Romney' to the end: 'right_columns'
right_columns = election.loc[:, 'Romney':]

# Create the list of row labels: rows
rows = ['Philadelphia', 'Centre', 'Fulton']

# Create the list of column labels: cols
cols = ['winner', 'Obama', 'Romney']

# Create the new DataFrame: three_counties
three_counties = election.loc[rows, cols]

# Create the boolean array: high_turnout
high_turnout = election['turnout'] > 70

# Filter the election DataFrame with the high_turnout array: high_turnout_df
high_turnout_df = election[high_turnout]

# Create the boolean array: too_close
too_close = election['margin'] < 1

# Assign np.nan to the 'winner' column where the results were too close to call
# (use .loc for the assignment to avoid chained-indexing warnings)
election.loc[too_close, 'winner'] = np.nan


# Select the 'age' and 'cabin' columns: df
df = titanic.loc[:, ['age','cabin']]

# Print the shape of df
print(df.shape)

# Drop rows in df with how='any' and print the shape
print(df.dropna(how='any').shape)

# Drop rows in df with how='all' and print the shape
print(df.dropna(how="all").shape)

# Drop columns in titanic with less than 1000 non-missing values
print(titanic.dropna(thresh=1000, axis='columns').info())

# Write a function to convert degrees Fahrenheit to degrees Celsius: to_celsius
def to_celsius(F):
    return 5/9*(F - 32)

# Apply the function over 'Mean TemperatureF' and 'Mean Dew PointF': df_celsius
df_celsius = weather[['Mean TemperatureF', 'Mean Dew PointF']].apply(to_celsius)

# Reassign the columns of df_celsius
df_celsius.columns = ['Mean TemperatureC', 'Mean Dew PointC']

# Create the dictionary: red_vs_blue
red_vs_blue = {'Obama': 'blue', 'Romney': 'red'}

# Use the dictionary to map the 'winner' column to the new column: election['color']
election['color'] = election['winner'].map(red_vs_blue)

vectorized functions

# Import zscore from scipy.stats
from scipy.stats import zscore

# Call zscore with election['turnout'] as input: turnout_zscore
turnout_zscore = zscore(election['turnout'])

# Print the type of turnout_zscore
print(type(turnout_zscore))

# Assign turnout_zscore to a new column: election['turnout_zscore']
election['turnout_zscore'] = turnout_zscore
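
For comparison, the same standardization can be written in plain pandas; a sketch (ddof=0 matches scipy's zscore default):

turnout = election['turnout']
manual_zscore = (turnout - turnout.mean()) / turnout.std(ddof=0)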

pandas Index and MultiIndex

In [1]: sales
Out[1]: 
  state  month  eggs  salt  spam
0    CA      1    47  12.0    17
1    CA      2   110  50.0    31
2    NY      1   221  89.0    72
3    NY      2    77  87.0    20
4    TX      1   132   NaN    52
5    TX      2   205  60.0    55

In [5]: # Set the index to be the columns ['state', 'month']: sales
        sales  = sales.set_index(['state', 'month'])
        
        # Sort the MultiIndex: sales
        sales = sales.sort_index()
        
        # Print the sales DataFrame
        print(sales)
             eggs  salt  spam
state month                  
CA    1        47  12.0    17
      2       110  50.0    31
NY    1       221  89.0    72
      2        77  87.0    20
TX    1       132   NaN    52
      2       205  60.0    55

In [23]: # Look up data for NY in month 1: NY_month1

In [25]: sales.loc[('NY', 1), :]
Out[25]: 
eggs    221.0
salt     89.0
spam     72.0
Name: (NY, 1), dtype: float64

In [26]: # Look up data for CA and TX in month 2: CA_TX_month2

In [27]: sales.loc[(['CA','TX'], 2), :]
Out[27]: 
             eggs  salt  spam
state month                  
CA    2       110  50.0    31
TX    2       205  60.0    55

In [28]: # Look up data for all states in month 2: all_month2

In [29]: sales.loc[(slice(None), 2), :]
Out[29]: 
             eggs  salt  spam
state month                  
CA    2       110  50.0    31
NY    2        77  87.0    20
TX    2       205  60.0    55
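
An equivalent, arguably more readable, spelling uses pd.IndexSlice (a sketch; it requires the MultiIndex to be sorted, which it is here):

idx = pd.IndexSlice
sales.loc[idx[:, 2], :]   # all states, month 2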

stacking and unstacking

In [6]: users
Out[6]: 
                visitors  signups
city   weekday                   
Austin Mon           326        3
       Sun           139        7
Dallas Mon           456        5
       Sun           237       12

In [7]: users.unstack(level='weekday')
Out[7]: 
        visitors      signups    
weekday      Mon  Sun     Mon Sun
city                             
Austin       326  139       3   7
Dallas       456  237       5  12

In [8]: byweekday = users.unstack(level='weekday')

In [9]: byweekday.stack(level='weekday')
Out[9]: 
                visitors  signups
city   weekday                   
Austin Mon           326        3
       Sun           139        7
Dallas Mon           456        5
       Sun           237       12


In [1]: visitors_by_city_weekday
Out[1]: 
city     Austin  Dallas
weekday                
Mon         326     456
Sun         139     237

# Reset the index: visitors_by_city_weekday
visitors_by_city_weekday = visitors_by_city_weekday.reset_index() 

# Print visitors_by_city_weekday
print(visitors_by_city_weekday)

# Melt visitors_by_city_weekday: visitors
visitors = pd.melt(visitors_by_city_weekday, id_vars='weekday', value_name='visitors')

# Print visitors
print(visitors)
<script.py> output:
    city weekday  Austin  Dallas
    0        Mon     326     456
    1        Sun     139     237
      weekday    city  visitors
    0     Mon  Austin       326
    1     Sun  Austin       139
    2     Mon  Dallas       456
    3     Sun  Dallas       237
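
Going the other way is a pivot; a quick sketch on the melted frame above:

back = visitors.pivot(index='weekday', columns='city', values='visitors')
print(back)   # recovers the wide city-by-weekday layout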
    

groupby

# Group titanic by 'pclass': by_class
by_class = titanic.groupby('pclass')

# Select 'age' and 'fare'
by_class_sub = by_class[['age','fare']]

# Aggregate by_class_sub by 'max' and 'median': aggregated
aggregated = by_class_sub.agg(['max', 'median'])

# Print the maximum age in each class
print(aggregated.loc[:, ('age','max')])

# Print the median fare in each class
print(aggregated.loc[:, ('fare','median')])
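
If flat column names are easier to work with downstream, the MultiIndex column levels can be joined; a small sketch:

aggregated.columns = ['_'.join(col) for col in aggregated.columns]   # e.g. 'age_max', 'fare_median'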




# Read the CSV file into a DataFrame and sort the index: gapminder
gapminder = pd.read_csv('gapminder.csv', index_col=['Year','region','Country']).sort_index()

# Group gapminder by 'Year' and 'region': by_year_region
by_year_region = gapminder.groupby(level=['Year', 'region'])

# Define the function to compute spread: spread
def spread(series):
    return series.max() - series.min()

# Create the dictionary: aggregator
aggregator = {'population':'sum', 'child_mortality':'mean', 'gdp':spread}

# Aggregate by_year_region using the dictionary: aggregated
aggregated = by_year_region.agg(aggregator)

# Print the last 6 entries of aggregated 
print(aggregated.tail(6))

                                          gdp  child_mortality    population
    Year region                                                             
    2013 America                      49634.0        17.745833  9.629087e+08
         East Asia & Pacific         134744.0        22.285714  2.244209e+09
         Europe & Central Asia        86418.0         9.831875  8.968788e+08
         Middle East & North Africa  128676.0        20.221500  4.030504e+08
         South Asia                   11469.0        46.287500  1.701241e+09
         Sub-Saharan Africa           32035.0        76.944490  9.205996e+08
     
# GROUP BY DATETIME     
# Read file: sales
sales = pd.read_csv('sales.csv', index_col='Date', parse_dates=True)

# Create a groupby object: by_day
by_day = sales.groupby(sales.index.strftime('%a'))

# Create sum: units_sum
units_sum = by_day['Units'].sum()

# Print units_sum
print(units_sum)
<script.py> output:
    Mon    48
    Sat     7
    Thu    59
    Tue    13
    Wed    48
    Name: Units, dtype: int64
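
In recent pandas versions, day_name() is an alternative to strftime for this kind of grouping; a sketch:

by_day_full = sales.groupby(sales.index.day_name())['Units'].sum()   # keys like 'Monday', 'Tuesday', ...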



# Import zscore
from scipy.stats import zscore

# Group gapminder_2010: standardized
standardized = gapminder_2010.groupby('region')[['life','fertility']].transform(zscore)

# Construct a Boolean Series to identify outliers: outliers
outliers = (standardized['life'] < -3) | (standardized['fertility'] > 3)

# Filter gapminder_2010 by the outliers: gm_outliers
gm_outliers = gapminder_2010.loc[outliers]

# Print gm_outliers
print(gm_outliers)



# Read the CSV file into a DataFrame: sales
sales = pd.read_csv('sales.csv', index_col='Date', parse_dates=True)

# Group sales by 'Company': by_company
by_company = sales.groupby('Company')

# Compute the sum of the 'Units' of by_company: by_com_sum
by_com_sum = by_company['Units'].sum()
print(by_com_sum)

# Filter 'Units' where the sum is > 35: by_com_filt
by_com_filt = by_company.filter(lambda g:g['Units'].sum() > 35)
print(by_com_filt)
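
The same per-group filter can also be expressed with transform, which keeps the original index; a sketch:

keep = sales.groupby('Company')['Units'].transform('sum') > 35
print(sales[keep])   # same rows as by_com_filt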

In [1]: medals.head()
Out[1]: 
     City  Edition     Sport Discipline             Athlete  NOC Gender                       Event Event_gender   Medal
0  Athens     1896  Aquatics   Swimming       HAJOS, Alfred  HUN    Men              100m freestyle            M    Gold
1  Athens     1896  Aquatics   Swimming    HERSCHMANN, Otto  AUT    Men              100m freestyle            M  Silver
2  Athens     1896  Aquatics   Swimming   DRIVAS, Dimitrios  GRE    Men  100m freestyle for sailors            M  Bronze
3  Athens     1896  Aquatics   Swimming  MALOKINIS, Ioannis  GRE    Men  100m freestyle for sailors            M    Gold
4  Athens     1896  Aquatics   Swimming  CHASAPIS, Spiridon  GRE    Men  100m freestyle for sailors            M  Silver

In [2]: medals.loc[medals.NOC == 'USA'].groupby('Edition')['Medal'].count()
Out[2]: 
Edition
1896     20
1900     55
1904    394
1908     63
1912    101
1920    193
1924    198
1928     84
1932    181
1936     92
1948    148
1952    130
1956    118
1960    112
1964    150
1968    149
1972    155
1976    155
1984    333
1988    193
1992    224
1996    260
2000    248
2004    264
2008    315



In [4]: # Select the 'NOC' column of medals: country_names
        country_names = medals['NOC']
        
        # Count the number of medals won by each country: medal_counts
        medal_counts = country_names.value_counts()
        
        # Print top 15 countries ranked by medals
        print(medal_counts.head(15))

<script.py> output:
    USA    4335
    URS    2049
    GBR    1594
    FRA    1314
    ITA    1228
    
    
    
In [4]: # Construct the pivot table: counted
        counted = medals.pivot_table(index='NOC', columns='Medal', values='Athlete', aggfunc='count')
        
        # Create the new column: counted['totals']
        counted['totals'] = counted.sum(axis='columns')
        
        # Sort counted by the 'totals' column
        counted = counted.sort_values('totals', ascending=False)
        
        # Print the top 15 rows of counted
        print(counted.head(15))
Medal  Bronze    Gold  Silver  totals
NOC                                  
USA    1052.0  2088.0  1195.0  4335.0
URS     584.0   838.0   627.0  2049.0
GBR     505.0   498.0   591.0  1594.0
FRA     475.0   378.0   461.0  1314.0
ITA     374.0   460.0   394.0  1228.0




In [1]: # Select columns: ev_gen
        ev_gen = medals[['Event_gender', 'Gender']]
        
        # Drop duplicate pairs: ev_gen_uniques
        ev_gen_uniques = ev_gen.drop_duplicates()
        
        # Print ev_gen_uniques
        print(ev_gen_uniques)
      Event_gender Gender
0                M    Men
348              X    Men
416              W  Women
639              X  Women
23675            W    Men


In [13]: medals[(medals.Event_gender == 'W') & (medals.Gender == 'Men')]
Out[13]: 
         City  Edition      Sport Discipline            Athlete  NOC Gender     Event Event_gender   Medal
23675  Sydney     2000  Athletics  Athletics  CHEPCHUMBA, Joyce  KEN    Men  marathon            W  Bronze



In [2]: # Group medals by 'NOC': country_grouped
        country_grouped = medals.groupby('NOC')
        
        # Compute the number of distinct sports in which each country won medals: Nsports
        Nsports = country_grouped['Sport'].nunique()
        
        # Sort the values of Nsports in descending order
        Nsports = Nsports.sort_values(ascending=False)
        
        # Print the top 15 rows of Nsports
        print(Nsports.head(15))
NOC
USA    34
GBR    31
FRA    28
GER    26
CHN    24



In [3]: # Extract all rows for which the 'Edition' is between 1952 & 1988: during_cold_war
        during_cold_war = (medals['Edition'] >= 1952) & (medals['Edition'] <= 1988)
        
        # Extract rows for which 'NOC' is either 'USA' or 'URS': is_usa_urs
        is_usa_urs = medals.NOC.isin(['USA','URS'])
        
        # Use during_cold_war and is_usa_urs to create the DataFrame: cold_war_medals
        cold_war_medals = medals.loc[during_cold_war & is_usa_urs]
        
        # Group cold_war_medals by 'NOC'
        country_grouped = cold_war_medals.groupby('NOC')
        
        # Create Nsports
        Nsports = country_grouped['Sport'].nunique().sort_values(ascending=False)
        
        # Print Nsports
        print(Nsports)
NOC
URS    21
USA    20
Name: Sport, dtype: int64


In [10]: medals.pivot_table(index='Edition', columns='NOC', values='Athlete', aggfunc='count').loc[1952:1988, ['USA','URS']]
Out[10]: 
NOC        USA    URS
Edition              
1952     130.0  117.0
1956     118.0  169.0
1960     112.0  169.0
1964     150.0  174.0
1968     149.0  188.0
1972     155.0  211.0
1976     155.0  285.0
1980       NaN  442.0
1984     333.0    NaN
1988     193.0  294.0


# Create the DataFrame: usa
usa = medals[medals['NOC'] == 'USA']

# Group usa by ['Edition', 'Medal'] and aggregate over 'Athlete'
usa_medals_by_year = usa.groupby(['Edition', 'Medal'] )['Athlete'].count()

# Reshape usa_medals_by_year by unstacking
usa_medals_by_year = usa_medals_by_year.unstack(level='Medal')

# Plot the DataFrame usa_medals_by_year
usa_medals_by_year.plot()
plt.show()
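
A stacked area plot is often easier to read for medal composition over time; a small variation (assumes matplotlib.pyplot is imported as plt, as above):

usa_medals_by_year.plot.area()
plt.show()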

USEFUL FUNCTIONS

pd.set_option('display.expand_frame_repr', False)
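
A few related display options that are often handy (the values below are just illustrative):

pd.set_option('display.max_rows', 100)      # show up to 100 rows before truncating
pd.set_option('display.max_columns', 50)    # show up to 50 columns
pd.set_option('display.precision', 3)       # float display precision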