import pandas as p
import requests as r
from StringIO import StringIO
import datetime
from bs4 import BeautifulSoup
import re
from alchemyapi import AlchemyAPI
import json
import numpy as np
from flask import request
from os import path
import csv
# Downloads stock data from Yahoo, converts it into a pandas DataFrame,
# and returns it to the calling function as JSON.
def get_quotes(start_date='2015-04-01', end_date='2015-04-21', ticker='AAPL'):
    start = datetime.datetime.strptime(start_date, '%Y-%m-%d').date()
    end = datetime.datetime.strptime(end_date, '%Y-%m-%d').date()
    # Yahoo's ichart API takes zero-based start/end months (a/d),
    # days (b/e), and years (c/f).
    d = r.get('http://ichart.yahoo.com/table.csv?s={0}&a={1}&b={2}&c={3}&d={4}&e={5}&f={6}'.format(
        ticker, start.month - 1, start.day, start.year,
        end.month - 1, end.day, end.year))
    data = StringIO(d.content)
    df = p.read_csv(data, sep=',')
    df = df.set_index('Date')
    return df.to_json()
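# Example usage (a sketch: the ichart.yahoo.com endpoint has since been
# retired, so this only works against a mirror or archived copy of it):
#
#   quotes = get_quotes('2015-04-01', '2015-04-21', 'AAPL')
#   df = p.read_json(quotes)
#   print(df.head())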
# The functions below dynamically construct URLs that mimic the advanced-search
# URLs of Yahoo Finance and Twitter. This sidesteps the restrictions of the
# Twitter API and avoids Yahoo's API altogether; combined with the
# BeautifulSoup library, it allows historical tweets and news to be downloaded.
def construct_search_url_yh(start_date, end_date=None, ticker='AAPL'):
    if end_date is None:
        end_date = datetime.date.today()
    url_dict = {}
    date_list = [end_date - datetime.timedelta(days=x)
                 for x in range(1, (end_date - start_date).days + 1)]
    for d in date_list:
        g_url = "http://finance.yahoo.com/q/h?s={0}&t={1}".format(ticker, str(d))
        url_dict[str(d)] = g_url
    return url_dict
def construct_search_url_tw(search_term, start_date, end_date=None):
    if end_date is None:
        end_date = datetime.date.today()
    url_dict = {}
    # Build one since/until search window per day, keyed by that day's date.
    day = start_date
    while day < end_date:
        next_day = day + datetime.timedelta(days=1)
        g_url = ("https://twitter.com/search?q={0}%20from%3Ayahoofinance"
                 "%20since%3A{1}%20until%3A{2}&src=typd").format(
            search_term, day, next_day)
        url_dict[str(day)] = g_url
        day = next_day
    return url_dict
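# Example: build one search URL per day from start_date up to today (the
# default end_date). Both constructors expect datetime.date objects for the
# date arguments:
#
#   start = datetime.date(2015, 4, 1)
#   yh_urls = construct_search_url_yh(start, ticker='AAPL')
#   tw_urls = construct_search_url_tw('AAPL', start)
#   print(yh_urls[str(start)])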
# Fetches and parses historical tweets from a URL generated above.
def collect_historical_tweets(url):
    req = r.get(url)
    beatw = BeautifulSoup(req.text, 'html.parser')
    twits_list = []
    for pa in beatw.find_all('p'):
        # Tweet bodies are <p> tags carrying the "js-tweet-text" class.
        if "js-tweet-text" in pa.get('class', []):
            twits_list += [str(pa)]
    return twits_list
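# Example: pull the raw tweet markup for one day's search URL. This targets
# Twitter's 2015-era search page, so the class name assumed above may no
# longer exist in current markup:
#
#   urls = construct_search_url_tw('AAPL', datetime.date(2015, 4, 20),
#                                  datetime.date(2015, 4, 21))
#   tweets = collect_historical_tweets(urls['2015-04-20'])
#   print(len(tweets))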
# Scrapes the historical-news HTML obtained from Yahoo to identify links to
# articles posted on the requested day.
def news_scrape(rurl):
    links = []
    rss = r.get(rurl)
    soup = BeautifulSoup(rss.text, 'html.parser')
    for l in soup.find_all('a'):
        lhr = l.get('href', '')
        # External articles are wrapped in Yahoo redirect links of the form
        # ...*http://...", so strip the leading '*' and the trailing quote.
        mtch = re.search(r'\*http://.+?"', lhr)
        if mtch:
            links += [mtch.group()[1:-1]]
            continue
        # Otherwise look for articles hosted on finance.yahoo.com itself.
        mtch = re.search(r'http://finance\.yahoo\.com/news.+\.html', lhr)
        if mtch:
            links += [mtch.group()]
    return links
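# A minimal end-to-end sketch, guarded so it only runs when the file is
# executed directly. All endpoints are the 2015-era ones used above and may
# no longer respond:
if __name__ == '__main__':
    start = datetime.date(2015, 4, 1)
    end = datetime.date(2015, 4, 21)
    for day, url in construct_search_url_yh(start, end, ticker='AAPL').items():
        print(day)
        print(news_scrape(url))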