Created
May 7, 2020 15:09
-
-
Save VAD3R-95/07c35394d3fcc28d27b76a335c98611b to your computer and use it in GitHub Desktop.
Scrapers, miners, API extractors (news, NDTV, Twitter, API)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup as Bs | |
import requests as req | |
import calendar | |
import base64 as bs64 | |
# if 1000 news for a topic and each page contains 15 then approx around 65/66 pages :) | |
def date_convert(date_str):
    """Convert a dateline like 'Thursday April 30, 2020' to ISO 'YYYY-MM-DD'.

    The original returned unpadded components ('2020-4-30'); month and day
    are now zero-padded so the result really is YYYY-MM-DD as documented.
    """
    dow, moy, day, year = date_str.split()
    # calendar.month_name[0] is '', so 'January'..'December' map to 1..12
    month = dict(zip(calendar.month_name, range(13)))[moy]
    # day arrives as '30,' — strip the comma before padding
    return '{}-{:02d}-{:02d}'.format(year, month, int(day.replace(',', '')))
def parser():
    """Parse the NDTV topic page held in the module-global response ``r``.

    Returns a dict mapping article index -> 'href%base64(intro)%date'.
    The original pre-seeded ``dict.fromkeys(range(15), [])``: on pages with
    fewer than 15 articles the leftover ``[]`` placeholders later crashed
    (``list + '%'`` here, ``val.replace`` in get_values_to_file). The dict
    is now built only from links actually found, and the intro/date loops
    skip indexes with no matching link.
    """
    soup = Bs(r.content, 'html.parser')
    articles = {}

    # 1) headline links — these define which indexes exist
    idx = -1
    for header in soup.findAll("p", {"class": "header fbld"}):
        for link in header.findAll('a', href=True):
            idx += 1
            articles[idx] = link['href']

    # 2) intro paragraphs, base64-encoded so commas/newlines survive the CSV
    idx = -1
    for para in soup.findAll("p", {"class": "intro"}):
        idx += 1
        if idx not in articles:
            continue  # more intros than links on this page; skip orphans
        encoded = bs64.b64encode(para.getText().strip().encode('utf-8'))
        articles[idx] = articles[idx] + '%' + encoded.decode('utf-8')

    # 3) datelines, normalised to YYYY-MM-DD
    idx = -1
    for dateline in soup.findAll("p", {"class": "list_dateline"}):
        idx += 1
        if idx not in articles:
            continue
        date_str = dateline.getText()
        if '|' in date_str:
            # e.g. 'Agencies | Thursday April 30, 2020' — keep the tail
            date_str = " ".join(date_str.rsplit('|', 1)[1:])
        articles[idx] = articles[idx] + '%' + date_convert(date_str)
    return articles
def get_values_to_file(nest_dict):
    """Append every leaf value of (possibly nested) ``nest_dict`` to Tesla-news.csv.

    Each leaf is a '%'-separated record produced by parser(); '%' is swapped
    for ',' and the line is prefixed with the ticker tag 'TA'. Nested dicts
    (one per page) are recursed into.
    """
    with open('Tesla-news.csv', 'a') as f:
        for val in nest_dict.values():
            if isinstance(val, dict):
                # recurse into a per-page dict; it reopens the file in append
                # mode so ordering within this call is preserved
                get_values_to_file(val)
            else:
                line = 'TA' + ',' + val.replace("%", ",")
                print(line)
                f.write(line + "\n")
    # no explicit f.close(): the with-statement already closed the file
    # (the original called f.close() redundantly inside the with-block)
# Crawl up to 60 pages of NDTV's "load more" endpoint for the topic below
# and append each successfully fetched page's parsed articles to the CSV.
query = 'Tesla'
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
for page in range(1, 61):
    url = 'https://www.ndtv.com/page/topic-load-more?type=news&page=' + str(page) + '&query=' + query
    # parser() reads the module-global response object r, so keep this name
    r = req.get(url, headers={'User-Agent': user_agent})
    if r.status_code != 200:
        continue  # skip pages that failed to load
    get_values_to_file({'page_' + str(page): parser()})
# https://lerner.co.il/2019/05/12/python-dicts-and-memory-usage/ | |
# https://stackabuse.com/encoding-and-decoding-base64-strings-in-python/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from newsapi import NewsApiClient | |
import base64 as bs64 | |
import re | |
# Pre-compiled pattern matching any HTML/XML tag (anything between < and >).
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return *text* with every HTML/XML tag stripped out."""
    stripped = TAG_RE.sub('', text)
    return stripped
# Init the NewsAPI client.
# NOTE(review): api_key is blank here — supply a real key before running.
newsapi = NewsApiClient(api_key='')

# /v2/everything: all English 'Tesla' articles in the date window.
all_articles = newsapi.get_everything(q='Tesla',
                                      from_param='2020-04-05',
                                      to='2020-05-01',
                                      language='en',
                                      sort_by='relevancy')

with open('Tesla-news-api.csv', 'a') as f:
    for article in all_articles['articles']:
        # 'description' can be null in the API response; the original
        # passed it straight to remove_tags and crashed on None.
        news = remove_tags(article['description'] or '')
        # base64-encode so commas/newlines in the text don't break the CSV
        raw_news = bs64.b64encode(news.encode('utf-8')).decode('utf-8')
        datetime = article['publishedAt']        # e.g. '2020-04-30T12:34:56Z'
        date = datetime.split('T')[0]
        # leading apostrophe keeps spreadsheets from reformatting the date
        final_date = "'" + date
        final_name = article['source']['name'].split()[0]
        csv_line = 'TA' + ',' + raw_news + ',' + final_date + ',' + final_name + ',' + datetime
        print(csv_line)
        f.write(csv_line + "\n")
# file closed automatically by the with-block (original also called f.close())
# https://stackoverflow.com/questions/9662346/python-code-to-remove-html-tags-from-a-string |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from tweepy import Stream | |
from tweepy import OAuthHandler | |
from tweepy.streaming import StreamListener | |
import json | |
import base64 as bs64 | |
import calendar | |
#consumer key, consumer secret, access token, access secret.
# NOTE(review): all four Twitter API credentials are blank — fill them in
# before running, otherwise OAuthHandler authentication below will fail.
ckey=""
csecret=""
atoken=""
asecret=""
def deEmojify(inputString):
    """Drop every non-ASCII character (emoji, accents, symbols) from *inputString*."""
    ascii_only = inputString.encode('ascii', 'ignore')
    return ascii_only.decode('ascii')
def date_convert(date_str):
    """Convert Twitter's created_at ('Sun May 03 20:56:11 +0000 2020') to 'YYYY-MM-DD'.

    Twitter uses abbreviated month names ('Jan', 'Feb', ...); the original
    looked them up in calendar.month_name (full names) and raised KeyError
    for every month except May. Use month_abbr and zero-pad the month.
    """
    dow, moy, day, time, zone, year = date_str.split()
    # calendar.month_abbr[0] is '', so 'Jan'..'Dec' map to 1..12
    month = dict(zip(calendar.month_abbr, range(13)))[moy]
    # day-of-month arrives already zero-padded (e.g. '03')
    return '{}-{:02d}-{}'.format(year, month, day)
class listener(StreamListener):
    """Tweepy stream listener that appends the first 500 tweets to Tesla-tweet.csv."""

    def __init__(self):
        # original skipped super().__init__(), losing tweepy's own setup
        super().__init__()
        self.tweet_count = 0  # how many tweets this listener has handled

    def on_data(self, data):
        """Handle one raw tweet JSON payload; return False after 500 to stop the stream."""
        self.tweet_count += 1
        if self.tweet_count > 500:
            # original did f.close() here, but f is never defined on this
            # path — that raised NameError instead of ending the stream
            return False

        all_data = json.loads(data)
        tid = str(all_data[u'id'])
        created_at = all_data[u'created_at']   # e.g. 'Sun May 03 20:56:11 +0000 2020'
        date_time = created_at.replace(" ", "")
        date = date_convert(created_at)
        user = deEmojify(all_data[u'user'][u'name'])
        tweet = deEmojify(all_data[u'text'])
        # base64-encode the tweet so commas/newlines don't corrupt the CSV
        raw_tweet = bs64.b64encode(tweet.encode('utf-8')).decode('utf-8')
        print('{},{},{},{},{},{}'.format('TA', tid, date, user, raw_tweet, date_time))
        with open('Tesla-tweet.csv', 'a') as f:
            # leading apostrophe keeps spreadsheets from mangling the id
            f.write('TA' + ',' + "'" + tid + ',' + date + ',' + user + ',' + raw_tweet + ',' + date_time + '\n')
        return True

    def on_error(self, status):
        """Print stream errors (e.g. 420 rate limiting) without stopping."""
        print(status)
# Authenticate with the Twitter streaming API using the credentials above.
auth = OAuthHandler(ckey, csecret)
auth.set_access_token(atoken, asecret)
# Open a realtime stream filtered to English tweets mentioning "Tesla";
# listener() ends the stream by returning False after 500 tweets.
twitterStream = Stream(auth, listener())
twitterStream.filter(track=["Tesla"],languages=["en"])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment