git
discard all local changes/commits and pull from upstream
git reset --hard origin/master
git pull origin master
$ scrapy runspider txspider.py
2016-07-05 23:11:39 [scrapy] INFO: Scrapy 1.1.0 started (bot: scrapybot)
2016-07-05 23:11:39 [scrapy] INFO: Overridden settings: {}
2016-07-05 23:11:40 [scrapy] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats', 'scrapy.extensions.logstats.LogStats']
2016-07-05 23:11:40 [scrapy] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
sudo apt-get install -y python-dev python-pip libxml2-dev libxslt1-dev zlib1g-dev libffi-dev libssl-dev
pip install virtualenv
git
discard all local changes/commits and pull from upstream
git reset --hard origin/master
git pull origin master
from scrapy.xlib.pydispatch import dispatcher
from scrapy import signals
from scrapy.exceptions import DropItem
from scrapy.utils.serialize import ScrapyJSONEncoder
from carrot.connection import BrokerConnection
from carrot.messaging import Publisher
from twisted.internet.threads import deferToThread
sudo apt-get update
sudo apt-get install -y python-dev python-pip libxml2-dev libxslt1-dev zlib1g-dev libffi-dev libssl-dev
pip install Scrapy
sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv EA312927
echo "deb http://repo.mongodb.org/apt/ubuntu trusty/mongodb-org/3.2 multiverse" | sudo tee /etc/apt/sources.list.d/mongodb-org-3.2.list
sudo apt-get update
sudo apt-get install -y mongodb-org
sudo service mongod start
pip install pymongo
# -x, --proxy <[protocol://][user:password@]proxyhost[:port]>
#
# Use the specified HTTP proxy.
# If the port number is not specified, it is assumed at port 1080.
curl -x http://proxy_server:proxy_port --proxy-user username:password -L http://url |
import datetime

# datetime object to string
datetime.datetime.today().strftime("%m/%d/%Y %H:%M")
# from string to datetime object — the format must match the input string:
# "Mar 22, 2016 00:00" is abbreviated month name, day, comma, year, time,
# so the correct format is "%b %d, %Y %H:%M" (the old "%Y%m%d %H:%M"
# could never parse this string and raised ValueError).
datetime.datetime.strptime('Mar 22, 2016 00:00', "%b %d, %Y %H:%M")
# plus one day
datetime.datetime.today() + datetime.timedelta(days=1)
# Following next tag
response.xpath('//h2[contains(text(), "Contact information")]/following::table[1]//text()').extract()
# Tag contains
//person[contains(firstname, 'Kerr') and contains(lastname, 'och')] |
# Compiled source #
###################
*.com
*.class
*.dll
*.exe
*.o
*.so
*.pyc
# http://docs.python-requests.org/en/latest/user/advanced/#body-content-workflow
def download_file(url, chunk_size=1024):
    """Stream *url* into a local file named after the last path segment.

    Parameters:
        url: HTTP(S) address to download.
        chunk_size: bytes read from the response stream per write
            (default 1024, matching the original behaviour).

    Returns:
        The local file name the content was written to.

    Raises:
        requests.HTTPError: on a non-2xx response (raise_for_status).
    """
    local_filename = url.split('/')[-1]
    # NOTE the stream=True parameter: defers downloading the body so we can
    # write it chunk by chunk instead of holding the whole payload in memory.
    # The `with` block ensures the connection is released even on error.
    with requests.get(url, stream=True) as r:
        r.raise_for_status()  # fail loudly instead of saving an error page
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=chunk_size):
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)
    return local_filename