Skip to content

Instantly share code, notes, and snippets.

@ajinux
Created April 25, 2017 18:44
Show Gist options
  • Save ajinux/2912637c6a3455abcc3b00252d749012 to your computer and use it in GitHub Desktop.
Save ajinux/2912637c6a3455abcc3b00252d749012 to your computer and use it in GitHub Desktop.
AIconclave moneycontrol crawler
import requests
from bs4 import BeautifulSoup
import csv
# Running totals updated by the crawl functions below.
count_sub = 0   # number of article bodies successfully extracted by subcrawl()
count_main = 0  # number of news-listing entries processed by maincrawl()
def subcrawl(suburl):
    """Fetch one moneycontrol article page and return its body text.

    Args:
        suburl: site-relative article path (e.g. "/news/...").

    Returns:
        [text, rtype] where text is the article body with newlines
        stripped (or a placeholder message when no article box is found)
        and rtype is 0 when the body came from an
        ``<article class="article_box">`` tag, 1 when it fell back to a
        ``<div class="article_box">``.

    Side effects:
        Increments the module-level ``count_sub`` counter on success and
        prints the URL being crawled.
    """
    global count_sub
    suburl = "http://moneycontrol.com" + suburl
    print("\ncrawl url :", suburl)
    resp = requests.get(suburl)
    soup = BeautifulSoup(resp.content, "lxml")
    # Newer article pages wrap the body in <article>; older ones use <div>.
    boxes = soup.find_all("article", {"class": "article_box"})
    rtype = 0
    if not boxes:
        boxes = soup.find_all("div", {"class": "article_box"})
        rtype = 1
    # Only the first matching box is used (the original loop returned on
    # its very first iteration, so the loop was effectively an if).
    if boxes:
        count_sub += 1
        return [boxes[0].get_text().replace("\n", ""), rtype]
    return ["No data fetched", rtype]
def maincrawl(url="http://www.moneycontrol.com/stocks/company_info/stock_news.php?sc_id=TEL&scat=&pageno=10&next=0&durationType=M&Year=&duration=3&news_type="):
    """Crawl a moneycontrol stock-news listing page and append rows to eggs.csv.

    Args:
        url: listing page to crawl; defaults to the original hard-coded
            TEL stock-news page so existing callers are unaffected.

    For every news entry on the listing page, one CSV row is written:
    (time, date, source, absolute article URL, article text, rtype),
    where the article text and rtype come from subcrawl().

    Side effects:
        Appends to 'eggs.csv', increments the module-level ``count_main``
        counter, and prints progress information.
    """
    global count_main
    resp = requests.get(url)
    soup = BeautifulSoup(resp.content, "lxml")
    # `with` guarantees the CSV file is closed even if a request or parse
    # step raises mid-loop (the original leaked the handle on error).
    with open('eggs.csv', 'a+', newline='') as csvfile:
        writer = csv.writer(csvfile)
        for entry in soup.find_all("div", {"class": "MT15 PT10 PB10"}):
            # Metadata paragraph looks like "HH:MM | DD Mon YYYY | Source : Name".
            meta_text = entry.find_all("p", {"class": "PT3 a_10dgry"})[0].get_text()
            print("\n\n\n\ndate :", meta_text)
            time, date, source = [x.strip() for x in meta_text.split("|")]
            source = source.split(":")[1].strip()
            count_main += 1
            sub_url = entry.find_all("a")[0].get("href")
            temp = subcrawl(sub_url)
            print(temp)
            source_data, rtype = temp
            writer.writerow((time, date, source,
                             "http://moneycontrol.com" + sub_url,
                             source_data, rtype))
if __name__ == "__main__":
    # Run the crawl only when executed as a script, not when imported.
    maincrawl()
    print("\nMain :", count_main)
    print("\nSub :", count_sub)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment