Scrapers, miners, API extractors (NDTV news, NewsAPI, Twitter)
from bs4 import BeautifulSoup as Bs
import requests as req
import calendar
import base64 as bs64

# if there are ~1000 news items for a topic and each page holds 15,
# that works out to roughly 65/66 pages :)

def date_convert(date_str):
    # 'Thursday April 30, 2020' -> 'YYYY-MM-DD'
    dow, moy, date, year = date_str.split()
    mon_year = dict(zip(calendar.month_name, range(13)))  # month name -> number
    month = mon_year[moy]
    day = date.replace(',', '')
    # zero-pad month and day so the output really is YYYY-MM-DD
    formatted_date = year + '-' + str(month).zfill(2) + '-' + day.zfill(2)
    return formatted_date
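# Quick sanity check (illustrative input in NDTV's dateline format):
#   >>> date_convert('Thursday April 30, 2020')
#   '2020-04-30'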
def parser(r):
    # parse one listing page: for each of the (up to) 15 stories, collect
    # link, intro and date keyed by position on the page
    soup = Bs(r.content, 'html.parser')
    new_data = dict.fromkeys(range(15), '')

    indx = -1
    for a_tag in soup.findAll("p", {"class": "header fbld"}):
        for link in a_tag.findAll('a', href=True):
            indx += 1
            new_data[indx] = link['href']

    indx = -1
    my_para = soup.findAll("p", {"class": "intro"})
    for para_text in my_para:
        paragraph = para_text.getText().strip()
        # base64-encode the intro so commas/newlines can't break the CSV
        para_enc = bs64.b64encode(paragraph.encode('utf-8'))
        raw_para = para_enc.decode('utf-8')
        indx += 1
        new_data[indx] = new_data[indx] + '%' + raw_para

    indx = -1
    my_date = soup.findAll("p", {"class": "list_dateline"})
    for date_text in my_date:
        indx += 1
        date_str = date_text.getText()
        if date_str.find('|') == -1:  # no '|' separator in the dateline
            final_date = date_convert(date_str)
        else:
            # keep only the text after the last '|' (go reverse :)
            word_date = date_str.rsplit('|', 1)[-1]
            final_date = date_convert(word_date)
        new_data[indx] = new_data[indx] + '%' + final_date

    return new_data
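# Each dict value ends up as 'link%base64(intro)%YYYY-MM-DD';
# get_values_to_file() below swaps the '%' separators for commas.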
def get_values_to_file(nest_dict):
    # walk the nested dict recursively and append one CSV row per story
    with open('Tesla-news.csv', 'a') as f:
        for k, val in nest_dict.items():
            if type(val) is dict:
                get_values_to_file(val)
            else:
                csv_file = val.replace("%", ",")
                csv_file = 'TA' + ',' + csv_file
                print(csv_file)
                f.write(csv_file + "\n")
    # no explicit f.close() needed: the 'with' block closes the file
query = 'Tesla'
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'

for page in range(1, 61):
    new_dict = {}
    url = 'https://www.ndtv.com/page/topic-load-more?type=news&page=' + str(page) + '&query=' + query
    r = req.get(url, headers={'User-Agent': user_agent})
    if r.status_code == 200:
        page_num = 'page_' + str(page)
        new_dict[page_num] = parser(r)
        get_values_to_file(new_dict)

# https://lerner.co.il/2019/05/12/python-dicts-and-memory-usage/
# https://stackabuse.com/encoding-and-decoding-base64-strings-in-python/
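# A minimal sketch of reading the scraped CSV back, assuming the
# 'Tesla-news.csv' layout written above (tag, link, base64 intro, date);
# everything here follows from this script, nothing else is guaranteed.
import base64
import csv

with open('Tesla-news.csv') as f:
    for row in csv.reader(f):
        tag, link, b64_intro, news_date = row
        intro = base64.b64decode(b64_intro).decode('utf-8')
        print(news_date, link, intro[:60])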
from newsapi import NewsApiClient
import base64 as bs64
import re

TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    return TAG_RE.sub('', text)
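# e.g. (illustrative): remove_tags('<p>Tesla <b>up</b> 5%</p>') -> 'Tesla up 5%'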
# Init
newsapi = NewsApiClient(api_key='')

# /v2/everything
all_articles = newsapi.get_everything(q='Tesla',
                                      from_param='2020-04-05',
                                      to='2020-05-01',
                                      language='en',
                                      sort_by='relevancy')
with open('Tesla-news-api.csv', 'a') as f:
    for i in all_articles['articles']:
        news_html = i['description']
        news = remove_tags(news_html)
        # base64-encode the description so commas can't break the CSV
        base64_news = bs64.b64encode(news.encode('utf-8'))
        raw_news = base64_news.decode('utf-8')
        datetime = i['publishedAt']  # e.g. '2020-04-05T12:34:56Z'
        date, time = datetime.split('T')
        final_date = "'" + date  # leading quote stops spreadsheets reformatting the date
        name = i['source']['name']
        final_name = name.split()[0]  # first word of the source name
        csv_file = 'TA' + ',' + raw_news + ',' + final_date + ',' + final_name + ',' + datetime
        print(csv_file)
        f.write(csv_file + "\n")
# no explicit f.close() needed: the 'with' block closes the file
# https://stackoverflow.com/questions/9662346/python-code-to-remove-html-tags-from-a-string
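# A minimal pagination sketch, assuming the client's optional 'page' and
# 'page_size' keyword arguments (values here are illustrative; the API plan
# in use caps how many results actually come back):
# for p in range(2, 6):
#     more = newsapi.get_everything(q='Tesla', language='en',
#                                   sort_by='relevancy', page=p, page_size=100)
#     print(p, len(more['articles']))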
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
import json
import base64 as bs64
import calendar

# consumer key, consumer secret, access token, access secret
ckey = ""
csecret = ""
atoken = ""
asecret = ""

def deEmojify(inputString):
    # strip emoji (and any other non-ASCII) so names/tweets stay CSV-safe
    return inputString.encode('ascii', 'ignore').decode('ascii')
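# e.g. (illustrative): deEmojify('Tesla to the moon 🚀') -> 'Tesla to the moon '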
def date_convert(date_str):
    # 'Sun May 03 20:56:11 +0000 2020' -> 'YYYY-MM-DD'
    dow, moy, date, time, zone, year = date_str.split()
    # created_at uses abbreviated month names ('Jan', 'Feb', ...),
    # so map with calendar.month_abbr, not calendar.month_name
    mon_year = dict(zip(calendar.month_abbr, range(13)))
    month = mon_year[moy]
    formatted_date = year + '-' + str(month).zfill(2) + '-' + date
    return formatted_date
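# Quick sanity check (illustrative created_at string):
#   >>> date_convert('Sun May 03 20:56:11 +0000 2020')
#   '2020-05-03'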
class listener(StreamListener):
    def __init__(self):
        self.tweet_count = 0

    def on_data(self, data):
        self.tweet_count += 1
        if self.tweet_count <= 500:  # stop after 500 tweets
            all_data = json.loads(data)
            tweet_id = all_data[u'id']
            tid = str(tweet_id)
            datetime = all_data[u'created_at']
            date_time = datetime.replace(" ", "")
            date = date_convert(datetime)
            raw_user = all_data[u'user'][u'name']
            user = deEmojify(raw_user)
            enc_tweet = all_data[u'text']
            tweet = deEmojify(enc_tweet)
            # base64-encode the tweet text so commas can't break the CSV
            tweet_enc = bs64.b64encode(tweet.encode('utf-8'))
            raw_tweet = tweet_enc.decode('utf-8')
            print('{},{},{},{},{},{}'.format('TA', tid, date, user, raw_tweet, date_time))
            with open('Tesla-tweet.csv', 'a') as f:
                csv_file = 'TA' + ',' + "'" + tid + ',' + date + ',' + user + ',' + raw_tweet + ',' + date_time
                f.write(csv_file + '\n')
            return True
        else:
            # the file is opened/closed per tweet by the 'with' above,
            # so there is nothing to close here; just stop the stream
            return False

    def on_error(self, status):
        print(status)
auth = OAuthHandler(ckey, csecret)
auth.set_access_token(atoken, asecret)
twitterStream = Stream(auth, listener())
twitterStream.filter(track=["Tesla"], languages=["en"])
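# Note: this uses the tweepy 3.x streaming API (a StreamListener subclass
# passed to Stream(auth, listener)); tweepy 4.x removed StreamListener, so
# this script assumes a 3.x install, e.g. pip install "tweepy<4".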