# ================== Loading & Exploring JSON ================== #
# Import package
import json
# Load JSON: json_data
with open("a_movie.json") as json_file:
    json_data = json.load(json_file)
# Print each key-value pair in json_data
for k in json_data.keys():
    print(k + ': ', json_data[k])
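# When the JSON is deeply nested, pretty-printing the whole structure can be
# easier to read than looping over keys; a minimal sketch using the json_data
# loaded above.
print(json.dumps(json_data, indent=2, sort_keys=True))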
# ================== Query API & decode JSON data ================== #
# Import package
import requests
# Assign URL to variable: url
'''
API query string should have two arguments: apikey=ff21610b and t=social+network
'''
url = 'http://www.omdbapi.com/?apikey=ff21610b&t=social+network'
# Package the request, send the request and catch the response: r
r = requests.get(url)
# Decode the JSON data into a dictionary: json_data
json_data = r.json()
# Print each key-value pair in json_data
for k in json_data.keys():
    print(k + ': ', json_data[k])
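# The response object also carries the HTTP status, and individual fields can be
# pulled straight from the decoded dictionary; a sketch assuming 'Title' and
# 'Year' keys, which OMDb normally returns for a matched title.
print(r.status_code)
print(json_data.get('Title'), json_data.get('Year'))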
# ----------- A) Wikipedia API ------- #
# Assign URL to variable: url (CHECK URL FOR WIKI API)
url = 'https://en.wikipedia.org/w/api.php?action=query&prop=extracts&format=json&exintro=&titles=pizza'
# Package the request, send the request and catch the response: r
r = requests.get(url)
# Decode the JSON data into a dictionary: json_data
json_data = r.json()
# Print the Wikipedia page extract
pizza_extract = json_data['query']['pages']['24768']['extract']
print(pizza_extract)
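# The page id ('24768') above is specific to this query; a sketch that avoids
# hardcoding it is to iterate over whatever keys the 'pages' dictionary contains.
for page_id, page in json_data['query']['pages'].items():
    print(page['extract'])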
# ----------- B) Twitter API ------- #
# Import package
import tweepy
# -_-_- i) Handle Twitter API OAuth Authentication
# Store OAuth authentication credentials in relevant variables
access_token = "1092294848-aHN7DcRP9B4VMTQIhwqOYiB14YkW92fFO8k8EPy"
access_token_secret = "X4dHmhPfaksHcQ7SCbmZa2oYBBVSD2g8uIHXsp5CTaksx"
consumer_key = "nZ6EA0FxZ293SxGNg8g8aP0HM"
consumer_secret = "fJGEodwe3KiKUnsYJC3VRndj7jevVvXbK2D5EiJ2nehafRgA6i"
# Pass OAuth details to tweepy's OAuth handler
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
# -_-_- ii) Stream Tweets
# Initialize Stream listener (CHECK Tweet_listener.py for class object)
l = MyStreamListener()
# Create your Stream object with authentication
stream = tweepy.Stream(auth, l)
# Filter Twitter Streams to capture data by the keywords:
stream.filter(track=['clinton', 'trump', 'sanders', 'cruz'])
# -_-_- iii) Load & Explore Twitter Data
# Twitter data saved to text file from previous step
# Import package
import json
# String of path to file: tweets_data_path
tweets_data_path = 'tweets.txt'
# Initialize empty list to store tweets: tweets_data
tweets_data = []
# Open connection to file
tweets_file = open(tweets_data_path, "r")
# Read in tweets and store in list: tweets_data
for line in tweets_file:
    tweet = json.loads(line)
    tweets_data.append(tweet)
# Close connection to file
tweets_file.close()
# Print the keys of the first tweet dict
print(tweets_data[0].keys())
# -_-_- iv) Twitter data to Dataframe
# Import package
import pandas as pd
# Build DataFrame of tweet texts and languages
df = pd.DataFrame(tweets_data, columns=['text', 'lang'])
# Print head of DataFrame
print(df.head())
# -_-_- v) Twitter Analysis
# Initialize a tweet counter for each candidate
[clinton, trump, sanders, cruz] = [0, 0, 0, 0]
import re
def word_in_text(word, tweet):
    """Return True if word appears in the tweet text (case-insensitive)."""
    word = word.lower()
    text = tweet.lower()
    match = re.search(word, text)
    if match:
        return True
    return False
# Iterate through df, counting the number of tweets in which
# each candidate is mentioned
for index, row in df.iterrows():
    clinton += word_in_text('clinton', row['text'])
    trump += word_in_text('trump', row['text'])
    sanders += word_in_text('sanders', row['text'])
    cruz += word_in_text('cruz', row['text'])
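# The same counts can be computed without an explicit loop via pandas'
# vectorized string matching; a sketch, assuming df['text'] holds the tweet text.
counts = {cand: df['text'].str.contains(cand, case=False, na=False).sum()
          for cand in ['clinton', 'trump', 'sanders', 'cruz']}
print(counts)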
# -_-_- vi) Plot Twitter Data
# Import packages
import seaborn as sns
import matplotlib.pyplot as plt
# Set seaborn style
sns.set(color_codes=True)
# Create a list of labels: cd
cd = ['clinton', 'trump', 'sanders', 'cruz']
# Plot bar chart of tweet counts
ax = sns.barplot(x=cd, y=[clinton, trump, sanders, cruz])
ax.set(ylabel="count")
plt.show()
# Import numpy
import numpy as np
# Assign the filename: file
file = 'digits_header.txt'
# Load the data: data
'''
for example, you can use ',' and '\t' for comma-delimited and tab-delimited respectively;
skiprows allows you to specify how many rows (not indices) you wish to skip;
usecols takes a list of the indices of the columns you wish to keep.
'''
data = np.loadtxt(file, delimiter='\t', skiprows=1, usecols=[0,2])
# Print data
print(data)
# Import data as floats and skip the first row: data_float
data_float = np.loadtxt(file, delimiter='\t', dtype=float, skiprows=1)
# Print the 10th element of data_float
print(data_float[9])
# Plot a scatterplot of the data
plt.scatter(data_float[:, 0], data_float[:, 1])
plt.xlabel('time (min.)')
plt.ylabel('percentage of larvae')
plt.show()
# Import data containing mixed datatypes (i.e. one column containing strings and another floats)
data = np.genfromtxt('titanic.csv', delimiter=',', names=True, dtype=None)
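# With names=True, genfromtxt returns a structured array whose columns can be
# accessed by header name; a sketch, assuming titanic.csv has a 'Survived' column.
print(data['Survived'])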
# ================== Loading a pickled file ================== #
# Import pickle package
import pickle
# Open pickle file and load data: d
with open('data.pkl', "rb") as file:
    d = pickle.load(file)
# Print d
print(d)
# Print datatype of d
print(type(d))
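# For completeness, pickle also serializes objects back to disk; a minimal
# sketch writing a small (hypothetical) dictionary to a new file.
example = {'a': 1, 'b': [2, 3]}
with open('example.pkl', 'wb') as f:
    pickle.dump(example, f)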
# ================== Loading sheets in Excel Files ================== #
# Import pandas to import excel spreadsheets
import pandas as pd
# Assign spreadsheet filename: file
file = 'battledeath.xlsx'
# Load spreadsheet: xl
xl = pd.ExcelFile(file)
# Print sheet names
print(xl.sheet_names)
# ----------- A) Parse sheets into dataframes ------- #
# Parse the first sheet and rename the columns: df1
df1 = xl.parse(0, skiprows=[0], names=['Country', 'AAM due to War (2002)'])
# Print the head of the DataFrame df1
print(df1.head())
# Parse the first column of the second sheet and rename the column: df2
df2 = xl.parse(1, parse_cols=[0], skiprows=[0], names=['Country'])
# Print the head of the DataFrame df2
print(df2.head())
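# Sheets can also be parsed by name rather than by index; a sketch, assuming one
# of the names printed by xl.sheet_names above is '2004'.
df3 = xl.parse('2004', skiprows=[0], names=['Country', 'AAM due to War (2004)'])
print(df3.head())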
# ================== Importing SAS/STATA Files ================== #
# Import sas7bdat package
from sas7bdat import SAS7BDAT
# Save file to a DataFrame: df_sas
with SAS7BDAT('sales.sas7bdat') as file:
    df_sas = file.to_data_frame()
# Print head of DataFrame
print(df_sas.head())
# Plot histogram of DataFrame features (pandas and pyplot already imported)
pd.DataFrame.hist(df_sas[['P']])
plt.ylabel('count')
plt.show()
# Load Stata file into a pandas DataFrame: df
df = pd.read_stata('disarea.dta')
# Print the head of the DataFrame df
print(df.head())
# Plot histogram of one column of the DataFrame
pd.DataFrame.hist(df[['disa10']])
plt.xlabel('Extent of disease')
plt.ylabel('Number of countries')
plt.show()
# ================== Importing HDF5 Files ================== #
# Import packages
import numpy as np
import h5py
# Assign filename: file
file = 'LIGO_data.hdf5'
# Load file: data
data = h5py.File(file, 'r')
# Print the datatype of the loaded file
print(type(data))
# Print the keys of the file
for key in data.keys():
    print(key)
# ----------- A) Extracting data from HDF5 File ------- #
# Get the HDF5 group: group
group = data['strain']
# Set variable equal to time series data: strain
strain = group['Strain'][()]  # [()] reads the full dataset into a NumPy array
# Set number of time points to sample: num_samples
num_samples = 10000
# Set time vector
time = np.arange(0, 1, 1/num_samples)
# Plot data
plt.plot(time, strain[:num_samples])
plt.xlabel('GPS Time (s)')
plt.ylabel('strain')
plt.show()
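# The group behaves like a dictionary, so its datasets can be listed before
# extracting anything; a minimal sketch inspecting the 'strain' group above.
for key in group.keys():
    print(key, group[key])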
# ================== Importing MATLAB Files ================== #
# Import package
import scipy.io
# Load MATLAB file: mat
mat = scipy.io.loadmat('albeck_gene_expression.mat')
# Print the datatype type of mat
print(type(mat))
# ================== Importing Flat Files with pandas ================== #
# Assign the filename: file
file = 'digits.csv'
# Read the first 5 rows of the file into a DataFrame: data
data = pd.read_csv(file, nrows=5, header=None)
# Build a numpy array from the DataFrame: data_array
data_array = data.values
# Print the datatype of data_array to the shell
print(type(data_array))
# ================== Customizing Pandas Import ================== #
# Import file: data (handle comments, empty lines, missing values)
data = pd.read_csv(file, sep='\t', comment='#', na_values='Nothing')
# Print the head of the DataFrame
print(data.head())
# Plot 'Age' variable in a histogram
pd.DataFrame.hist(data[['Age']])
plt.xlabel('Age (years)')
plt.ylabel('count')
plt.show()
# ================== SQL Query Databases ================== #
# Import packages
from sqlalchemy import create_engine
import pandas as pd
# Create engine: engine
engine = create_engine('sqlite:///Chinook.sqlite')
# Open engine connection: con
con = engine.connect()
# Perform query: rs
rs = con.execute('SELECT * FROM Album')
# Save results of the query to DataFrame: df
df = pd.DataFrame(rs.fetchall())
# Close connection
con.close()
# Print head of DataFrame df
print(df.head())
# ----------- A) Customize SQl query results ------- #
# Create engine: engine
engine = create_engine('sqlite:///Chinook.sqlite')
# Open engine in context manager
with engine.connect() as con:
    rs = con.execute('SELECT * FROM Employee ORDER BY BirthDate')
    df = pd.DataFrame(rs.fetchall())
    # Set the DataFrame's column names
    df.columns = rs.keys()
# Print head of DataFrame
print(df.head())
# ----------- B) Write results of SQL Query directly to DataFrame ------- #
# Create engine: engine
engine = create_engine('sqlite:///Chinook.sqlite')
# Execute query and store records in DataFrame: df
df = pd.read_sql_query("SELECT * FROM Album", engine)
# Print head of DataFrame
print(df.head())
# ----------- C) Relationships between tables: INNER JOIN ------- #
# Open engine in context manager
# Perform query and save results to DataFrame: df
with engine.connect() as con:
    rs = con.execute("SELECT Title, Name FROM Album INNER JOIN Artist ON Album.ArtistID = Artist.ArtistID")
    df = pd.DataFrame(rs.fetchall())
    df.columns = rs.keys()
# Print head of DataFrame df
print(df.head())
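# The same join can be pushed through pd.read_sql_query in a single step; a
# sketch reusing the engine and tables from above.
df = pd.read_sql_query(
    "SELECT Title, Name FROM Album INNER JOIN Artist ON Album.ArtistID = Artist.ArtistID",
    engine)
print(df.head())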
# ================== Files from Web ================== #
# ----------- A) Flat Files ------- #
# Import packages
import matplotlib.pyplot as plt
import pandas as pd
# Assign url of file: url
url = 'https://s3.amazonaws.com/assets.datacamp.com/production/course_1606/datasets/winequality-red.csv'
# Read file into a DataFrame: df
df = pd.read_csv(url, sep=';')
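# If a local copy of the file is wanted as well, urllib can download it first;
# a minimal sketch saving it under an assumed local filename.
from urllib.request import urlretrieve
urlretrieve(url, 'winequality-red.csv')
df = pd.read_csv('winequality-red.csv', sep=';')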
# ----------- B) NonFlat Files ------- #
# Assign url of file: url
url = 'http://s3.amazonaws.com/assets.datacamp.com/course/importing_data_into_r/latitude.xls'
# Read in all sheets of Excel file: xl
xl = pd.read_excel(url, sheet_name=None)
# Print the sheetnames to the shell
print(xl.keys())
# Print the head of the first sheet (using its name, NOT its index)
print(xl['1700'].head())
# ================== HTTP Requests ================== #
# Import package
import requests
# Specify the url: url
url = "http://www.datacamp.com/teach/documentation"
# Package the request, send the request and catch the response: r
r = requests.get(url)
# Extract the response: text
html_doc = r.text
# Print the html
print(html_doc)
# ----------- A) Parsing HTML with BeautifulSoup ------- #
# Import package
from bs4 import BeautifulSoup
# Create a BeautifulSoup object from the HTML: soup
soup = BeautifulSoup(html_doc, 'html.parser')
# Print the title of the webpage
print(soup.title)
# Find all 'a' tags (which define hyperlinks): a_tags
a_tags = soup.find_all('a')
# Print the URLs to the shell
for link in a_tags:
    print(link.get('href'))
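# Beyond individual tags, the page's readable text can be pulled out in one
# call; a minimal sketch using the soup object built above.
print(soup.get_text())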
What I learned:
- Importing text files and flat files
- Importing files in other formats
- Writing SQL queries
- Getting data from relational databases
- Pulling data from the web
- Pulling data from APIs
class MyStreamListener(tweepy.StreamListener):
    def __init__(self, api=None):
        super(MyStreamListener, self).__init__()
        self.num_tweets = 0
        self.file = open("tweets.txt", "w")

    def on_status(self, status):
        tweet = status._json
        self.file.write(json.dumps(tweet) + '\n')
        self.num_tweets += 1
        # Stop streaming (and close the file) after 100 tweets
        if self.num_tweets < 100:
            return True
        self.file.close()
        return False

    def on_error(self, status):
        print(status)