Project 2k1
----- IntelliJ IDEA project settings: file encodings -----
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="Encoding" useUTFGuessing="true" native2AsciiForPropertiesFiles="false" />
</project>
----- IntelliJ IDEA project settings: project SDK (Python 2.7.3) -----
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectRootManager" version="2" project-jdk-name="Python 2.7.3 (/usr/bin/python2.7)" project-jdk-type="Python SDK" />
</project>
----- IntelliJ IDEA project settings: module list -----
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/untitled1.iml" filepath="$PROJECT_DIR$/.idea/untitled1.iml" />
    </modules>
  </component>
</project>
----- untitled1.iml: Python module definition (referenced by the module list above) -----
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$" />
    <orderEntry type="jdk" jdkName="Python 2.7.3 (/usr/bin/python2.7)" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
</module>
----- IntelliJ IDEA project settings: dependency validation -----
<component name="DependencyValidationManager">
  <state>
    <option name="SKIP_IMPORT_STATEMENTS" value="false" />
  </state>
</component>
----- IntelliJ IDEA project settings: VCS directory mappings -----
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="VcsDirectoryMappings">
    <mapping directory="" vcs="" />
  </component>
</project>
----- IntelliJ IDEA workspace settings -----
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ChangeListManager">
    <option name="TRACKING_ENABLED" value="true" />
    <option name="SHOW_DIALOG" value="false" />
    <option name="HIGHLIGHT_CONFLICTS" value="true" />
    <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
    <option name="LAST_RESOLUTION" value="IGNORE" />
  </component>
  <component name="ChangesViewManager" flattened_view="true" show_ignored="false" />
  <component name="CreatePatchCommitExecutor">
    <option name="PATCH_PATH" value="" />
  </component>
  <component name="DaemonCodeAnalyzer">
    <disable_hints />
  </component>
  <component name="ProjectLevelVcsManager" settingsEditedManually="false">
    <OptionsSetting value="true" id="Add" />
    <OptionsSetting value="true" id="Remove" />
    <OptionsSetting value="true" id="Checkout" />
    <OptionsSetting value="true" id="Update" />
    <OptionsSetting value="true" id="Status" />
    <OptionsSetting value="true" id="Edit" />
    <ConfirmationsSetting value="0" id="Add" />
    <ConfirmationsSetting value="0" id="Remove" />
  </component>
  <component name="RunManager">
    <list size="0" />
  </component>
  <component name="ShelveChangesManager" show_recycled="false" />
  <component name="TaskManager">
    <task active="true" id="Default" summary="Default task">
      <option name="number" value="Default" />
    </task>
    <servers />
  </component>
  <component name="VcsContentAnnotationSettings">
    <option name="myLimit" value="2678400000" />
  </component>
  <component name="VcsManagerConfiguration">
    <option name="myTodoPanelSettings">
      <TodoPanelSettings />
    </option>
  </component>
  <component name="XDebuggerManager">
    <breakpoint-manager />
    <watches-manager />
  </component>
</project>
----- Stray LibreOffice lock-file entry -----
nhat ,nhat,nhat-sentifi,17.12.2014 12:10,file:///home/nhat/.config/libreoffice/3;
----- Module stub (author tag only) -----
__author__ = 'nhat'
----- app.py: consume company messages from RabbitMQ and crawl each one -----
__author__ = 'khongcoten'

import time
import traceback
import json

import amqp
from amqp import Message
from kombu import connections, BrokerConnection

from crawler2 import Spider2


class Process():
    crawler = 0
    crawler2 = 0

    def __init__(self):
        self.crawler2 = Spider2()

    def process2(self, name_origin, name_BW, isin, ticker, list_date, sector, industry):
        return self.crawler2.parse(name_origin=name_origin, name_BW=name_BW, isin=isin, ticker=ticker,
                                   list_date=list_date, sector=sector, industry=industry)


if __name__ == "__main__":
    print '============================================================'
    print 'Running app.py'
    count = 0
    broker_url = 'amqp://worker:caydemcayngay@dev.ssh.sentifi.com:5672'
    connection = connections[BrokerConnection(broker_url)].acquire(block=True)
    channel = connection.channel()
    queue_name = 'Test_Queues'
    no_ack = False
    queue_out = 'Test_Queues_E_Exchange'
    process = Process()
    live = True
    while live:
        response = channel.basic_get(queue_name, no_ack=no_ack)
        if response is None:
            # Queue is drained: wait once more, then stop the loop.
            time.sleep(5)
            live = False
            continue
        message = Message()
        message.body = response.body
        message.tag = response
        print 'Number ', str(count)
        print 'Message', message.body
        count = count + 1
        try:
            t = json.loads(message.body)
            isin = None
            name_origin = t['new_Company']
            name_BW = t['new_name_BW']
            ticker = None
            list_date = None
            sector = None
            industry = None
            country = None
            if 'new_symbol' in t and t['new_symbol']:
                ticker = t['new_symbol']
            if 'ListDate' in t and t['ListDate']:
                list_date = t.get('ListDate', {})
            if 'Sector' in t and t['Sector']:
                sector = t.get('Sector', {})
            if 'ISIN' in t and t['ISIN']:
                isin = t.get('ISIN', {})
            if 'Industry' in t and t['Industry']:
                industry = t.get('Industry', {})
            if 'Country' in t and t['Country']:
                country = t.get('Country', {})
            if ticker:
                flag = process.process2(name_origin=name_origin, name_BW=name_BW, isin=isin, ticker=ticker,
                                        list_date=list_date, sector=sector, industry=industry)
            else:
                flag = False
            if not flag:
                # Crawl failed: re-publish the message to the error exchange.
                m = amqp.Message(message.body)
                channel.basic_publish(m, exchange=queue_out)
        except Exception as e:
            m = amqp.Message(message.body)
            channel.basic_publish(m, exchange=queue_out)
        channel.basic_ack(message.tag.delivery_tag)
    print 'Finishing app.py'
    print '============================================================'
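For reference, a minimal sketch of the JSON payload app.py expects on Test_Queues, inferred from the fields read above; the values are illustrative, not real data:

    example = {
        "new_Company": "3i Group",   # company name as listed on the exchange
        "new_name_BW": "3i Group",   # name used for the BusinessWeek lookup
        "new_symbol": "III",         # ticker; crawling is skipped when this is missing
        "ListDate": None,            # the remaining fields are all optional
        "Sector": None,
        "ISIN": None,
        "Industry": None,
        "Country": None,
    }
    # e.g. posted with:
    # channel.basic_publish(amqp.Message(json.dumps(example)), exchange='Test_Queues')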
----- bloomberg_detail.py: Scrapy spider for sector, industry, and name -----
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from pymongo import MongoClient, ReadPreference
import string
import HTMLParser


class DmozSpider(BaseSpider):
    alphabet = list(string.lowercase)
    name = "bloomberg_detail"
    # Scrapy expects bare domains here; a scheme/path would break the offsite filter.
    allowed_domains = ["investing.businessweek.com"]
    start_urls = []
    MAIN_DB_HOST1 = 'localhost'
    MAIN_DB_PORT = 27017
    REL_DB = 'TRAIN'
    REL_COLL = 'All_Companies'
    client = MongoClient(MAIN_DB_HOST1, MAIN_DB_PORT)
    read_preference = ReadPreference.SECONDARY
    rel_coll = client[REL_DB][REL_COLL]

    def __init__(self):
        print '============================================================'
        print 'Running bloomberg_detail.py'
        # Only visit companies whose sector has not been scraped yet.
        for r in self.rel_coll.find({"sector_BW": {"$exists": False}}):
            url = r['url_BW']
            self.start_urls.append(url)

    num = 0

    def parse(self, response):
        html_parser = HTMLParser.HTMLParser()
        sel = HtmlXPathSelector(response)
        url = str(response.url)
        print url
        sector = sel.select('//*[@id="columnLeft"]/div/div[2]/div[1]/div[1]/div[1]/h2/a/text()').extract()
        if sector:
            sector = html_parser.unescape(' '.join(sector[0].splitlines()).strip())
            sector = sector.lower().replace('sector', '').strip()
        industry = sel.select('//*[@id="columnLeft"]/div/div[2]/div[1]/div[1]/div[3]/h2/a/text()').extract()
        if industry:
            industry = html_parser.unescape(' '.join(industry[0].splitlines()).strip())
            industry = industry.lower().replace('industry', '').strip()
        name = sel.select('//*[@id="columnLeft"]/div/h2[1]/span[1]/text()').extract()
        name = html_parser.unescape(' '.join(name[0].splitlines()).strip())
        print 'XXXXX'
        print name
        print sector
        print industry
        print 'YYYYY'
        if not sector:
            sector = None
        if not industry:
            industry = None
        self.rel_coll.update({"url_BW": url},
                             {"$set": {
                                 "sector_BW": sector,
                                 "industry_BW": industry,
                                 "name_BW": name
                             }})
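Assuming these spiders live inside a configured Scrapy project (other files in this gist rely on get_project_settings), each can be run by its name attribute with the standard Scrapy CLI:

    scrapy crawl bloomberg_detail
    scrapy crawl bloomberg_detail2
    scrapy crawl bloomberg_people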
----- bloomberg_detail2.py: Scrapy spider for contact, address, website, and description -----
from lxml.html import html_parser
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from pymongo import MongoClient, ReadPreference
import string
import HTMLParser


class DmozSpider(BaseSpider):
    alphabet = list(string.lowercase)
    name = "bloomberg_detail2"
    allowed_domains = ["investing.businessweek.com"]
    start_urls = []
    MAIN_DB_HOST = 'localhost'
    MAIN_DB_PORT = 27017
    REL_DB = 'TRAIN'
    REL_COLL = 'All_Companies'
    client = MongoClient(MAIN_DB_HOST, MAIN_DB_PORT)
    read_preference = ReadPreference.SECONDARY
    rel_coll = client[REL_DB][REL_COLL]

    def __init__(self):
        print '============================================================'
        print 'Running bloomberg_detail2.py'
        for r in self.rel_coll.find():
            url = r['url_BW']
            # The "_article" page carries the contact and description details.
            url = url.replace(r'.asp', r"_article.asp")
            self.start_urls.append(url)

    num = 0

    def parse(self, response):
        html_parser = HTMLParser.HTMLParser()
        sel = HtmlXPathSelector(response)
        try:
            # If we were redirected, recover the original snapshot URL as the Mongo key.
            urll = response.request.meta['redirect_urls']
            url = urll[0].replace(r'_article.asp', r'.asp')
        except KeyError:
            url = response.url
            url = url.replace(r'_article.asp', r'.asp')
        contact = ""
        address = ""
        contacts = sel.select('//*[@id="subColElement"]/div/*')
        for c in contacts:
            temp = c.select("text()").extract()
            if len(temp) > 0:
                temp = html_parser.unescape(' '.join(temp[0].splitlines()).strip())
                temp += "\n"
                address += temp
        phone = sel.select('//*[@id="subColElement"]/p[1]/text()').extract()
        if len(phone) >= 1:
            phone = html_parser.unescape(' '.join(phone[0].splitlines()).strip())
            number = sel.select('//*[@id="subColElement"]/p[1]/span/text()').extract()
            print len(number)
            if len(number) >= 1:
                number = html_parser.unescape(' '.join(number[0].splitlines()).strip())
                number = " " + number
                contact += phone
                contact += number
                contact += "\n"
        fax = sel.select('//*[@id="subColElement"]/p[2]/text()').extract()
        if len(fax) >= 1:
            fax = html_parser.unescape(' '.join(fax[0].splitlines()).strip())
            contact += fax
        website = sel.select('//*[@id="subColElement"]/span/a/text()').extract()
        if len(website) >= 1:
            website = html_parser.unescape(' '.join(website[0].splitlines()).strip())
        else:
            website = ""
        description = sel.select('//*[@id="article"]/p/text()').extract()
        if len(description) >= 1:
            description = html_parser.unescape(' '.join(description[0].splitlines()).strip())
        else:
            description = ""
        print url
        self.rel_coll.update({"url_BW": url},
                             {"$set": {
                                 'contact_BW': contact,
                                 'description_BW': description,
                                 'website_BW': website,
                                 'address_BW': address
                             }})
----- bloomberg_people.py: Scrapy spider for key executives -----
from lxml.html import html_parser
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from pymongo import MongoClient, ReadPreference
import string
import HTMLParser


class DmozSpider(BaseSpider):
    alphabet = list(string.lowercase)
    name = "bloomberg_people"
    allowed_domains = ["investing.businessweek.com"]
    start_urls = []
    MAIN_DB_HOST = 'localhost'
    MAIN_DB_PORT = 27017
    REL_DB = 'TRAIN'
    REL_COLL = 'All_Companies'
    REL_DB2 = 'TRAIN'
    REL_COLL2 = 'People'
    client = MongoClient(MAIN_DB_HOST, MAIN_DB_PORT)
    read_preference = ReadPreference.SECONDARY
    rel_coll = client[REL_DB][REL_COLL]
    rel_coll2 = client[REL_DB2][REL_COLL2]

    def __init__(self):
        print '============================================================'
        print 'Running bloomberg_people.py'
        for r in self.rel_coll.find():
            url = r['url_BW']
            url = url.replace(r'snapshot', r"people")
            self.start_urls.append(url)

    num = 0

    def parse(self, response):
        html_parser = HTMLParser.HTMLParser()
        sel = HtmlXPathSelector(response)
        try:
            urll = response.request.meta['redirect_urls']
            url = urll[0].replace(r'people', r'snapshot')
        except KeyError:
            url = response.url
            url = url.replace(r'people', r'snapshot')
        body = sel.xpath('//*[@id="keyExecs"]/*')
        for b in body:
            name = b.xpath('td[1]/span/a/text()').extract()
            if len(name) >= 1:
                name = html_parser.unescape(' '.join(name[0].splitlines()).strip())
                print name
            else:
                continue
            relationship = b.xpath('td[2]/a/strong/text()').extract()
            if len(relationship) >= 1:
                relationship = html_parser.unescape(' '.join(relationship[0].splitlines()).strip())
            title = b.xpath('td[3]/text()').extract()
            if len(title) >= 1:
                title = html_parser.unescape(' '.join(title[0].splitlines()).strip())
            age = b.xpath('td[4]/text()').extract()
            if len(age) >= 1:
                age = html_parser.unescape(' '.join(age[0].splitlines()).strip())
            self.rel_coll2.insert({"url": url, "name": name, "relationship": relationship, "title": title, "age": age})
        # NOTE: the other spiders key on "url_BW"; if no document has a "link" field,
        # this update matches nothing and the people flag is never set.
        self.rel_coll.update({"link": url},
                             {"$set": {
                                 "people": True
                             }})
----- crawler2.py: Selenium symbol-lookup crawler (the main crawler) -----
__author__ = 'khoi'

from lxml.html import html_parser
from pymongo import MongoClient, ReadPreference
import string
import HTMLParser
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import traceback
import amqp
from kombu import connections, BrokerConnection
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import json
from dateutil.parser import parse
import re
import fuzzywuzzy
from fuzzywuzzy import fuzz


###
### THIS IS CURRENTLY OUR MAIN CRAWLER
###
class Spider2():
    alphabet = list(string.lowercase)
    allowed_domains = ["http://investing.businessweek.com/"]
    start_urls = []
    MAIN_DB_HOST = 'localhost'
    MAIN_DB_PORT = 27017
    REL_DB = 'TRAIN'
    REL_COLL = 'All_Companies'
    exchange = ''
    client = MongoClient(MAIN_DB_HOST, MAIN_DB_PORT)
    read_preference = ReadPreference.SECONDARY
    rel_coll = client[REL_DB][REL_COLL]
    # Class attribute: Chrome launches as soon as this module is imported.
    chromedriver = webdriver.Chrome()

    def __init__(self):
        pass

    def parse(self, name_origin, name_BW, isin, ticker, list_date, sector, industry):
        print '============================================================'
        print 'Running crawler2.py'
        try:
            # clean up name
            # name = name.lower().replace('corporation', '')
            # name = name.lower().replace('ltd', '')
            # name = name.lower().replace('limited', '')
            # name = name.lower().replace('society', '')
            # name = name.lower().replace('trus', '')
            # name = name.lower().replace('plc', '')
            flag = False
            self.chromedriver.get("http://investing.businessweek.com/research/common/symbollookup/symbollookup.asp")
            wait = WebDriverWait(self.chromedriver, 10).until(
                EC.presence_of_element_located((By.XPATH, '//*[@id="textBox"]'))
            )
            search_box = self.chromedriver.find_element_by_xpath('//*[@id="textBox"]')
            search_box.send_keys(name_BW)
            go_button = self.chromedriver.find_element_by_xpath('//*[@id="controlTable"]/tbody/tr/td[4]/input')
            go_button.click()
            wait = WebDriverWait(self.chromedriver, 10).until(
                EC.presence_of_element_located((By.XPATH, '//*[@id="columnLeft"]/table/tbody'))
            )
            time.sleep(1)
            table = self.chromedriver.find_elements_by_xpath('//*[@id="columnLeft"]/table/tbody/*')
            for each in table:
                ticker_list = each.find_elements_by_tag_name('td')
                try:
                    ticker1, market = ticker_list[0].text.split(':')
                except:
                    break
                # if ticker1.lower() == ticker.lower() and market == 'LN':
                ticker = str(ticker).strip()
                # ticker_filter = ticker.split('^')[0]
                ticker_filter = ''.join(e for e in ticker if e.isalnum())
                ticker1 = str(ticker1).strip()
                ticker1_filter = ''.join(e for e in ticker1 if e.isalnum())
                # Fuzzy-match the alphanumeric parts of the two tickers.
                if fuzz.ratio(ticker_filter, ticker1_filter) > 70:
                    # if market == 'HK' or (ticker_list[2].text.strip() == 'Hong Kong'):
                    url = ticker_list[0].find_element_by_tag_name('a').get_attribute('href')
                    country = ticker_list[3].text
                    if list_date:
                        list_date = parse(str(int(list_date)))
                    self.rel_coll.insert({"url_BW": url, "ticker_BW": ticker1, "sector_exchange": sector, "list_date": list_date,
                                          "ticker_exchange": ticker, "isin": isin, "market_BW": market, "name_exchange": name_origin,
                                          "country": country, "exchange": self.exchange, "industry_exchange": industry})
                    flag = True
                    print "Insert ", ticker1, " with name: ", name_BW
                    continue
                ticker_filter = re.split(r'[\\^|.|*]', ticker, 1)[0]
                ticker1_filter = re.split(r'[\\^|.|*]', ticker1, 1)[0]
            time.sleep(1)
            return flag
        except Exception as e:
            print str(e)
            time.sleep(1)
            return False
        finally:
            print 'Finishing crawler2.py'
            print '============================================================'
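A minimal, hypothetical invocation of Spider2, mirroring the way app.py drives it; the sample values come from the CSV excerpt further down, and merely importing crawler2 opens a Chrome window because chromedriver is a class attribute:

    from crawler2 import Spider2

    spider = Spider2()
    ok = spider.parse(name_origin='3i Group', name_BW='3i Group', isin=None,
                      ticker='III', list_date=None, sector=None, industry=None)
    print ok  # True only if a fuzzy ticker match was inserted into Mongo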
----- Loader: post ready rows of all_company.csv to the Test_Queues exchange -----
from w3lib.encoding import read_bom
__author__ = 'nhat'
import pandas as pd
import json
import traceback
import amqp
from kombu import connections, BrokerConnection
from new_queue import new_queue
import string

df = pd.read_csv('all_company.csv', encoding='utf-8')
df = df.fillna('')
data_ready = df[df['ready'] == 1]
# print name.index.tolist()
rbmqueue = new_queue(config={"username": "worker", "password": "caydemcayngay", "host": "dev.ssh.sentifi.com"}, queue_name="Test_Queues")
# data_ready = data_ready.name
j = data_ready.to_json(orient='records')
# name = df.name
# j = name.to_json(orient = 'records')
# data_ready.to_json('all_ready_company.json')
temp = json.loads(j)
for each in temp:
    # Each ready row becomes one JSON message on the Test_Queues exchange.
    rbmqueue.post(json.dumps(each))
#
# # Get data from queue
# data = rbmqueue.receive()
# # Remove data from queue
# rbmqueue.delete(data)
----- Sample rows of the company CSV -----
    | id   | name             | new_name_BW      | new_Company      | new_symbol
----|------|------------------|------------------|------------------|-----------
2   | 1338 | 3i Group         | 3i Group         | 3i Group         | III
3   | 1467 | 3i Infrastruct   | 3i Infrastruct   | 3i Infrastruct   | 3IN
4   | 114  | 3M               | 3M               | 3M               | MMM
33  | 1700 | Aareal Bank      | Aareal Bank      | Aareal Bank      | ARL
51  | 396  | ABB              | ABB              | ABB              | ABB
86  | 174  | Aberdeen         | Aberdeen         | Aberdeen         | ADN
88  | 1468 | Aberforth Small  | Aberforth Small  | Aberforth Small  | ASL
91  | 178  | ABF              | ABF              | ABF              | ABF
140 | 529  | Abs. Return Fund | Abs. Return Fund | Abs. Return Fund |
221 | 1997 | Accu             | Accu             | Accu             | ACUN
----- demo_myspider.py (likely, given the imports in my_calling.py): UBS glossary spider over letters a-b -----
__author__ = 'nhat'
# --- Third-Party Libraries
from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy.http import Request
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from pymongo import MongoClient, ReadPreference
# --- Python Libraries
import re
from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy.settings import Settings
from scrapy import log, signals
from scrapy.xlib.pydispatch import dispatcher
from scrapy.utils.project import get_project_settings


class my_Spider(Spider):
    name = "ubs"
    allowed_domains = ["ubs.com"]
    start_urls = []
    url = 'https://www.ubs.com/global/en/asset_management/glossary/'

    def __init__(self):
        for i in 'ab':
            self.start_urls.append(self.url + i)
            print self.url + i

    count = 0

    def parse(self, response):
        self.count = self.count + 1
        print 'Spider 1++++++++++++++++++++++', self.count
        # hxs = HtmlXPathSelector(response)
        # words = hxs.select("//table[@class='tableHighlight ']/tbody/tr/th/p//text()").extract()
        # definitions = hxs.select("//table[@class='tableHighlight ']/tbody/tr/td/p//text()").extract()
        #
        # for i in range(0, len(words)):
        #     print definitions[i]
        #     print words[i]
----- demo_myspider2.py (likely): UBS glossary spider over the full alphabet -----
__author__ = 'nhat'
# --- Third-Party Libraries
from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy.http import Request
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from pymongo import MongoClient, ReadPreference
# --- Python Libraries
import re
from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy.settings import Settings
from scrapy import log, signals
from scrapy.xlib.pydispatch import dispatcher
from scrapy.utils.project import get_project_settings


class my_Spider(Spider):
    name = "ubs"
    allowed_domains = ["ubs.com"]
    start_urls = []
    url = 'https://www.ubs.com/global/en/asset_management/glossary/'

    def __init__(self):
        for i in 'abcdefghijklmnopqrstuvwxyz':
            self.start_urls.append(self.url + i)
            print self.url + i

    count1 = 0

    def parse(self, response):
        self.count1 = self.count1 + 1
        print 'Spider 2++++++++++++++++++++++', self.count1
        # hxs = HtmlXPathSelector(response)
        # words = hxs.select("//table[@class='tableHighlight ']/tbody/tr/th/p//text()").extract()
        # definitions = hxs.select("//table[@class='tableHighlight ']/tbody/tr/td/p//text()").extract()
        #
        # for i in range(0, len(words)):
        #     print definitions[i]
        #     print words[i]
----- headquater_google_crawler.py: scrape company headquarters from Google results -----
import time
__author__ = 'khoi'
import string
import re
import selenium
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from pymongo import MongoClient, ReadPreference
import urllib
import HTMLParser

if __name__ == "__main__":
    print '============================================================'
    print 'Running headquater_google_crawler.py'
    MAIN_DB_HOST = 'localhost'
    MAIN_DB_PORT = 27017
    REL_DB = 'TRAIN'
    REL_COLL = 'All_Companies'
    start_urls = []
    client = MongoClient(MAIN_DB_HOST, MAIN_DB_PORT)
    read_preference = ReadPreference.SECONDARY
    rel_coll = client[REL_DB][REL_COLL]
    html_parser = HTMLParser.HTMLParser()
    for each in rel_coll.find():
        name = each['name_exchange'].strip()
        name = urllib.quote(name)
        name = name.replace(" ", "+")
        name = name + '+headquarter'
        start_urls += ['https://www.google.com/search?q={0}&oq={1}&aqs=chrome..69i57j0l5.486j0j7&sourceid=chrome&es_sm=0&ie=UTF-8'.format(name, name)]
    chromedriver = webdriver.Chrome()
    for each in start_urls:
        try:
            chromedriver.get(each)
            hq = chromedriver.find_element_by_xpath('//*[@id="rso"]/li/div[1]/div/div[1]/ol/li/div/div[1]/a')
            if hq:
                headquarter = hq.text
                headquarter = headquarter.replace('\n', ' ')
                headquarter = ' '.join(headquarter.split())
            else:
                headquarter = ""
            # Recover the company name from the search box to use as the Mongo key.
            keyword = chromedriver.find_element_by_xpath('//*[@id="gbqfq"]')
            keyword = keyword.get_attribute('value')
            keyword = keyword.replace('headquarter', ' ')
            keyword = html_parser.unescape(' '.join(keyword.splitlines()).strip())
            print 'XXX'
            print headquarter
            print keyword
            print 'YYY'
            rel_coll.update({'name_exchange': keyword}, {'$set': {'headquarter': headquarter}})
            # Throttle to avoid Google rate-limiting the scraper.
            time.sleep(15)
        except Exception as e:
            print e
            time.sleep(5)
            continue
    print 'Finishing headquater_google_crawler.py'
    print '============================================================'
----- ADVFN crawler: look up ISIN and name for NYSE tickers -----
__author__ = 'khoi'
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from pymongo import MongoClient, ReadPreference
import string
import time
import re
from selenium.common.exceptions import NoSuchElementException
from pymongo import Connection
from util import MongoRepo

if __name__ == "__main__":
    chromedriver = webdriver.Chrome()
    MAIN_DB_HOST = '192.168.1.63'
    MAIN_DB_PORT = 27017
    exchange = 'NYSE:'
    rel_coll = MongoRepo(Connection('192.168.1.63')).use('BW', 'Companies_NYSE')
    url = ''
    tickers = []
    chromedriver.get("http://www.advfn.com/exchanges/NGI/NYA/stock-price")
    for each in rel_coll.find({'name_quotenet': {'$exists': 0}, 'name_advfn': {'$exists': 0}}):
        ticker = each['ticker_exchange']
        tickers += [ticker]
    for ticker in tickers:
        chromedriver.get("http://www.advfn.com/exchanges/NGI/NYA/stock-price")
        url2 = url
        flag = False
        count = 0
        while url2 == url:
            print 'ticker: ', ticker
            try:
                if count > 5:
                    # Give up on this ticker after repeated failed attempts.
                    flag = True
                    break
                input = chromedriver.find_element_by_xpath('//*[@id="symbol_entry"]')
                input.clear()
                input.send_keys(exchange + ticker)
                search_button = chromedriver.find_element_by_xpath('//*[@id="symbol_ok"]')
                search_button.click()
                time.sleep(5)
                count += 1
            except:
                chromedriver.get("http://www.advfn.com/exchanges/NGI/NYA/stock-price")
                continue
            url = chromedriver.current_url
        try:
            wait = WebDriverWait(chromedriver, 10).until(
                EC.presence_of_element_located((By.XPATH, '//*[@id="quoteElementPiece2"]'))
            )
            name_advfn = chromedriver.find_element_by_xpath('//*[@id="content"]/div[3]/table/tbody/tr[2]/td[1]/b')
            if name_advfn:
                name_advfn = name_advfn.text
            else:
                name_advfn = ''
            isin = chromedriver.find_element_by_xpath('//*[@id="quoteElementPiece2"]')
            if isin:
                isin = isin.text
            else:
                isin = ''
            print 'XXX'
            print ticker
            print name_advfn
            print isin
            print 'YYY'
            # NOTE: without a '$set', this update replaces the whole document.
            rel_coll.update({'ticker_exchange': ticker}, {
                'isin': isin, 'name_advfn': name_advfn
            })
        except Exception as e:
            print e
            time.sleep(3)
            continue
----- quotenet.com crawler: look up ISIN and name for tickers -----
# -*- coding: utf-8 -*-
from bson import ObjectId
__author__ = 'khoi'
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from pymongo import MongoClient, ReadPreference
import string
import time
import re
from selenium.common.exceptions import NoSuchElementException
from pymongo import Connection
from util import MongoRepo

if __name__ == "__main__":
    chromedriver = webdriver.Chrome()
    MAIN_DB_HOST = '192.168.1.63'
    MAIN_DB_PORT = 27017
    REL_DB = 'BW'
    REL_COLL = 'Companies_AMEX'
    client = MongoClient(MAIN_DB_HOST, MAIN_DB_PORT)
    read_preference = ReadPreference.SECONDARY
    rel_coll = MongoRepo(Connection('localhost')).use('BW', 'temp')
    # rel_coll = client[REL_DB][REL_COLL]
    url = ''
    tickers = []
    chromedriver.get("http://www.quotenet.com/stock-quotes")
    for each in rel_coll.find({'name_quotenet': {'$exists': 0}}):
        ticker = each['ticker_exchange']
        tickers += [ticker]
    print len(tickers)
    for ticker in tickers:
        url2 = url
        flag = False
        if len(ticker) > 1:
            if ticker == 'CBSTZ' or ticker == 'CELGZ':
                continue
        count = 0
        while url2 == url:
            print 'ticker: ', ticker
            try:
                if count > 5:
                    # Give up on this ticker after repeated failed attempts.
                    flag = True
                    break
                input = chromedriver.find_element_by_xpath('//*[@id="searchvalue"]')
                input.clear()
                input.send_keys(ticker)
                button = chromedriver.find_element_by_xpath('//*[@id="site"]/div[2]/div[2]/div/form/input')
                button.click()
                time.sleep(5)
                count += 1
            except:
                chromedriver.get("http://www.quotenet.com/stock-quotes")
                continue
            url = chromedriver.current_url
        try:
            # Case 1: the search jumps straight to the company page we want.
            if not flag:
                info_e = chromedriver.find_element_by_xpath('//*[@id="site"]/div[4]/div[2]/div/div[1]/h2/a')
                info = info_e.text
                name = re.findall(r'(.*)? \[', info)
                isin = re.findall(r'ISIN: (.*)?]', info)
                if name:
                    name = name[0]
                else:
                    name = None
                if isin:
                    isin = isin[0]
                else:
                    isin = None
                # NOTE: without a '$set', this update replaces the whole document.
                rel_coll.update({'ticker_exchange': ticker}, {'isin': isin, 'name_quotenet': name})
                print 'isin: ', isin
        except NoSuchElementException:
            print "deal with this case later"
----- my_calling.py (likely): run both UBS spiders in one reactor -----
__author__ = 'nhat'
from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy.settings import Settings
from scrapy import log, signals
from scrapy.xlib.pydispatch import dispatcher
from scrapy.utils.project import get_project_settings
from demo_myspider import my_Spider as bw1
from demo_myspider2 import my_Spider as bw2
import time


class ReactorControl:
    def __init__(self):
        self.crawlers_running = 0

    def add_crawler(self):
        self.crawlers_running += 1
        print '+++++++++++++++++++', self.crawlers_running

    def remove_crawler(self):
        self.crawlers_running -= 1
        print '-----------------', self.crawlers_running
        # Stop the reactor only once every spider has closed.
        if self.crawlers_running == 0:
            reactor.stop()


def setup_crawler1():
    print 'ok1'
    crawler = Crawler(settings)
    crawler.configure()
    crawler.signals.connect(reactor_control.remove_crawler, signal=signals.spider_closed)
    spider = bw1()
    crawler.crawl(spider)
    reactor_control.add_crawler()
    crawler.start()


def setup_crawler2():
    print 'ok2'
    crawler = Crawler(settings)
    crawler.configure()
    crawler.signals.connect(reactor_control.remove_crawler, signal=signals.spider_closed)
    spider = bw2()
    crawler.crawl(spider)
    reactor_control.add_crawler()
    crawler.start()


if __name__ == "__main__":
    print '========================================'
    print 'Running all spiders1 to get data in BW'
    reactor_control = ReactorControl()
    settings = get_project_settings()
    crawler = Crawler(settings)
    setup_crawler1()
    setup_crawler2()
    reactor.run()
    print 'Finishing all spiders1 to get data in BW'
    print '========================================'
----- my_calling2.py (likely): run the second UBS spider alone -----
__author__ = 'nhat'
from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy.settings import Settings
from scrapy import log, signals
from scrapy.xlib.pydispatch import dispatcher
from scrapy.utils.project import get_project_settings
from demo_myspider2 import my_Spider


def stop_reactor():
    reactor.stop()


def setup_crawler1():
    dispatcher.connect(stop_reactor, signal=signals.spider_closed)
    spider = my_Spider()
    settings = get_project_settings()
    crawler = Crawler(Settings())
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    reactor.run()


if __name__ == "__main__":
    print '========================================'
    print 'Running all spiders2 to get data in BW'
    # Scrapy spiders script...
    setup_crawler1()
    print 'Finishing all spiders2 to get data in BW'
    print '========================================'
----- Wrapper: run my_calling.py, then my_calling2.py, via execfile -----
__author__ = 'nhat'
from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy.settings import Settings
from scrapy import log, signals
from scrapy.xlib.pydispatch import dispatcher
from scrapy.utils.project import get_project_settings
from demo_myspider2 import my_Spider

if __name__ == "__main__":
    print '========================================'
    print 'Running all spiders2 to get data in BW'
    # Scrapy spiders script...
    # Caveat: my_calling.py calls reactor.run(), and Twisted's reactor cannot be
    # restarted in the same process, so the second execfile is likely to fail
    # with ReactorNotRestartable.
    execfile("my_calling.py")
    print '++++++++++++++++++++++++++++++run'
    execfile("my_calling2.py")
    print 'Finishing all spiders2 to get data in BW'
    print '========================================'
----- Alternative multi-spider runner (module level, with log.start) -----
from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy import log, signals
from scrapy.utils.project import get_project_settings
from demo_myspider import my_Spider as bw1
# NOTE: this imports the same spider twice; the my_calling.py variant above
# takes bw2 from demo_myspider2 instead.
from demo_myspider import my_Spider as bw2


class ReactorControl:
    def __init__(self):
        self.crawlers_running = 0

    def add_crawler(self):
        self.crawlers_running += 1

    def remove_crawler(self):
        self.crawlers_running -= 1
        if self.crawlers_running == 0:
            reactor.stop()


def setup_crawler():
    crawler = Crawler(settings)
    crawler.configure()
    crawler.signals.connect(reactor_control.remove_crawler, signal=signals.spider_closed)
    spider = bw1()
    crawler.crawl(spider)
    reactor_control.add_crawler()
    crawler.start()


def setup_crawler2():
    crawler = Crawler(settings)
    crawler.configure()
    crawler.signals.connect(reactor_control.remove_crawler, signal=signals.spider_closed)
    spider = bw2()
    crawler.crawl(spider)
    reactor_control.add_crawler()
    crawler.start()


reactor_control = ReactorControl()
log.start()
settings = get_project_settings()
crawler = Crawler(settings)
setup_crawler()
setup_crawler2()
reactor.run()
----- new_queue.py: thin RabbitMQ wrapper over amqp/kombu -----
__author__ = 'nhat'
import traceback
import amqp
from kombu import connections, BrokerConnection


class Message():
    body = None
    tag = None

    def __init__(self):
        pass


class new_queue():
    def __init__(self, config, queue_name):
        # broker_url = 'amqp://worker:caydemcayngay@dev.ssh.sentifi.com:5672/'
        broker_url = 'amqp://{0}:{1}@{2}:5672/'.format(config['username'], config['password'], config['host'])
        self.connection = connections[BrokerConnection(broker_url)].acquire(block=True)
        self.channel = self.connection.channel()
        self.queue_name = queue_name
        # self.is_exchange = queue_name.endswith('Exchange')
        self.no_ack = False

    def receive(self):
        """
        Receive a message from the queue.
        :return: a Message, or None if the queue is empty
        """
        response = self.channel.basic_get(self.queue_name, no_ack=self.no_ack)
        if response is None:
            return None
        message = Message()
        message.body = response.body
        message.tag = response
        return message

    def delete(self, message):
        """
        Acknowledge (delete) a message.
        :param message:
        :return:
        """
        self.channel.basic_ack(message.tag.delivery_tag)

    def reject(self, message):
        """
        Reject a message and requeue it.
        :param message:
        :return:
        """
        self.channel.basic_reject(message.tag.delivery_tag, True)

    def post(self, text):
        """
        Post a message to an exchange, retrying until it succeeds.
        :param text:
        :return:
        """
        # if not self.is_exchange:
        #     print "Only post to exchange"
        #     return False
        message = amqp.Message(text)
        while True:
            try:
                self.channel.basic_publish(msg=message, exchange=self.queue_name)
                break
            except Exception as e:
                print 'Post message to queue failed with exception'
                print traceback.format_exc(e)
                continue
        return True

    def close(self):
        self.channel.close()
        self.connection.close()

#
#
# # Push all data to RabbitMQ queues
# broker_url = 'amqp://worker:caydemcayngay@dev.ssh.sentifi.com:5672/'
# connection = connections[BrokerConnection(broker_url)].acquire(block=True)
# channel = connection.channel()
# for each in temp:
#     message = amqp.Message(each)
#     while True:
#         try:
#             channel.basic_publish(msg=message, exchange='Test_Queues')
#             break
#         except Exception as e:
#             print 'Post message to queue failed with exception'
#             print traceback.format_exc(e)
#             continue
# # Push all data to RabbitMQ queues
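A short usage sketch for new_queue, consistent with how the loader scripts in this gist use it; the broker credentials are the ones hard-coded throughout:

    q = new_queue(config={"username": "worker", "password": "caydemcayngay",
                          "host": "dev.ssh.sentifi.com"}, queue_name="Test_Queues")
    q.post('{"new_Company": "3M", "new_name_BW": "3M", "new_symbol": "MMM"}')
    msg = q.receive()      # returns None when the queue is empty
    if msg is not None:
        print msg.body
        q.delete(msg)      # ack; q.reject(msg) would requeue instead
    q.close()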
----- Loader: insert ready rows of all_company.csv into MongoDB -----
__author__ = 'nhat'
import json
import pandas as pd
from pymongo import MongoClient, ReadPreference

MAIN_DB_HOST = 'localhost'
MAIN_DB_PORT = 27017
REL_DB = 'TRAIN'
REL_COLL = 'All_Companies'
client = MongoClient(MAIN_DB_HOST, MAIN_DB_PORT)
read_preference = ReadPreference.SECONDARY
rel_coll = client[REL_DB][REL_COLL]
df = pd.read_csv('all_company.csv', encoding='utf-8')
df = df.fillna('')
data_ready = df[df['ready'] == 1]
# print name.index.tolist()
# name = df.name
j = data_ready.to_json(orient='records')
# data_ready.to_json('all_ready_company.json')
temp = json.loads(j)
for each in temp:
    rel_coll.insert(each)
----- followall spider: scrape sector, industry, and name for a single URL argument -----
__author__ = 'nhat'
from urlparse import urlparse
from scrapy.http import Request, HtmlResponse
from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from lxml.html import html_parser
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from pymongo import MongoClient, ReadPreference
import string
import HTMLParser


class FollowAllSpider(Spider):
    name = 'followall'
    allowed_domains = ["investing.businessweek.com"]
    start_urls = []
    MAIN_DB_HOST1 = 'localhost'
    MAIN_DB_PORT = 27017
    REL_DB = 'TRAIN'
    REL_COLL = 'All_Companies'
    # Mongo handle, as in bloomberg_detail.py; without it the update in
    # parse() would raise AttributeError.
    client = MongoClient(MAIN_DB_HOST1, MAIN_DB_PORT)
    read_preference = ReadPreference.SECONDARY
    rel_coll = client[REL_DB][REL_COLL]

    def __init__(self, **kw):
        super(FollowAllSpider, self).__init__(**kw)
        url = kw.get('url')
        self.start_urls.append(url)

    def parse(self, response):
        html_parser = HTMLParser.HTMLParser()
        sel = HtmlXPathSelector(response)
        url = str(response.url)
        print url
        sector = sel.select('//*[@id="columnLeft"]/div/div[2]/div[1]/div[1]/div[1]/h2/a/text()').extract()
        if sector:
            sector = html_parser.unescape(' '.join(sector[0].splitlines()).strip())
            sector = sector.lower().replace('sector', '').strip()
        industry = sel.select('//*[@id="columnLeft"]/div/div[2]/div[1]/div[1]/div[3]/h2/a/text()').extract()
        if industry:
            industry = html_parser.unescape(' '.join(industry[0].splitlines()).strip())
            industry = industry.lower().replace('industry', '').strip()
        name = sel.select('//*[@id="columnLeft"]/div/h2[1]/span[1]/text()').extract()
        name = html_parser.unescape(' '.join(name[0].splitlines()).strip())
        print 'XXXXX'
        print name
        print sector
        print industry
        print 'YYYYY'
        if not sector:
            sector = None
        if not industry:
            industry = None
        self.rel_coll.update({"url_BW": url},
                             {"$set": {
                                 "sector_BW": sector,
                                 "industry_BW": industry,
                                 "name_BW": name
                             }})
----- Pipeline driver: push demo_company.csv to the queue, then run app.py, Setup_Crawler.py, and headquater_google_crawler.py -----
__author__ = 'nhat'
import amqp
from amqp import Message
from kombu import connections, BrokerConnection
from crawler2 import Spider2
import json
from new_queue import new_queue
import pandas as pd
import subprocess
import app

if __name__ == "__main__":
    rabbitMQ = new_queue(config={"username": "worker", "password": "caydemcayngay", "host": "dev.ssh.sentifi.com"}, queue_name="Test_Queues")
    data_push = False
    print '======================================='
    print 'Start : Push Data'
    # Data to push onto RabbitMQ.
    # TODO: should check whether the data is already in RabbitMQ or not.
    data_push = pd.read_csv('demo_company.csv', encoding='utf-8')
    # data_ready = data_push.fillna('')
    # data_ready = data_push[data_push['ready']==1]
    # data_ready['new_name_BW'] = data_ready['name']
    # data_ready['new_Company'] = data_ready['name']
    # data_ready['new_symbol'] = data_ready['ticker']
    #
    # data_ready.to_csv('demo_company.csv', encoding='utf-8')
    j = data_push.to_json(orient='records')
    temp = json.loads(j)
    for each in temp:
        rabbitMQ.post(json.dumps(each))
    print 'End : Push Data'
    print '======================================='
    print '========================================'
    print 'Start : Run App.py and Crawler2.py'
    # Run app.py to crawl name_origin, name_BW, isin, ticker, list_date, sector, industry;
    # app.py in turn calls crawler2.py to get each company's ticker and ISIN.
    execfile("app.py")
    print 'End : Run App.py and Crawler2.py'
    print '========================================'
    print '========================================'
    print 'Start: Get all things from BW'
    # Run the BusinessWeek spiders to fetch the remaining details via url_BW.
    execfile("Setup_Crawler.py")
    print 'End: Get all things from BW'
    print '========================================'
    print '========================================'
    print 'Start: Get headquarters from Google'
    # Scrape each company's headquarters from Google search results.
    execfile("headquater_google_crawler.py")
    print 'End: Get headquarters from Google'
    print '========================================'
    #
    # subprocess.call(['java', '-jar', 'sf-crawler-1.0.0-SNAPSHOT.jar'])
----- Setup_Crawler.py (referenced by the pipeline driver): run the three BusinessWeek spiders -----
__author__ = 'nhat'
from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy.settings import Settings
from scrapy import log, signals
from scrapy.xlib.pydispatch import dispatcher
from scrapy.utils.project import get_project_settings
from demo_myspider import my_Spider
from bloomberg_detail import DmozSpider as bw1
from bloomberg_detail2 import DmozSpider as bw2
from bloomberg_people import DmozSpider as bw3


class ReactorControl:
    def __init__(self):
        self.crawlers_running = 0

    def add_crawler(self):
        self.crawlers_running += 1

    def remove_crawler(self):
        self.crawlers_running -= 1
        if self.crawlers_running == 0:
            reactor.stop()


def setup_crawler1():
    crawler = Crawler(settings)
    crawler.configure()
    crawler.signals.connect(reactor_control.remove_crawler, signal=signals.spider_closed)
    spider = bw1()
    crawler.crawl(spider)
    reactor_control.add_crawler()
    crawler.start()


def setup_crawler2():
    crawler = Crawler(settings)
    crawler.configure()
    crawler.signals.connect(reactor_control.remove_crawler, signal=signals.spider_closed)
    spider = bw2()
    crawler.crawl(spider)
    reactor_control.add_crawler()
    crawler.start()


def setup_crawler3():
    crawler = Crawler(settings)
    crawler.configure()
    crawler.signals.connect(reactor_control.remove_crawler, signal=signals.spider_closed)
    spider = bw3()
    crawler.crawl(spider)
    reactor_control.add_crawler()
    crawler.start()


if __name__ == "__main__":
    print '========================================'
    print 'Running all spiders to get data in BW'
    # Scrapy spiders script...
    reactor_control = ReactorControl()
    settings = get_project_settings()
    crawler = Crawler(settings)
    # Create crawlers
    setup_crawler1()
    setup_crawler2()
    setup_crawler3()
    reactor.run()
    print 'Finishing all spiders to get data in BW'
    print '========================================'