Skip to content

Instantly share code, notes, and snippets.

@nokados
Last active October 22, 2018 16:04
Show Gist options
  • Save nokados/ba1d5091a00dd7cedf24364470c987d6 to your computer and use it in GitHub Desktop.
Save nokados/ba1d5091a00dd7cedf24364470c987d6 to your computer and use it in GitHub Desktop.
import json, requests, time
import datetime
import pickle
from collections import Counter
from pyquery import PyQuery
from readability import Document
from lxml.etree import XMLSyntaxError, LxmlError
from readability.readability import Unparseable
from requests.adapters import MaxRetryError
from requests.exceptions import ConnectionError
from urllib3.exceptions import NewConnectionError
from urllib.parse import urlparse
from threading import Thread, Semaphore
import http.client as httplib, sys
from queue import Queue
import sqlite3
import os
def get_posts_fast_api(post_urllist):
    """Fetch stories concurrently and save their readable text to disk.

    Every story URL is requested with a plain ``GET`` through ``http.client``
    by a pool of daemon worker threads. For each response with status 200 the
    article text is extracted (readability ``Document`` + ``PyQuery``) and
    written to ``posts/<story_id>.txt``. Redirects are not followed: any
    status other than 200 is silently skipped.

    Args:
        post_urllist: Mapping of story id to a story mapping that contains a
            ``'url'`` entry.

    Returns:
        A ``(posts, errors, checked_urls)`` tuple. ``posts`` is a dict that
        this function never fills (the texts go to disk instead), ``errors``
        lists the URLs whose request raised, and ``checked_urls`` lists every
        URL a worker picked up.
    """
    posts = {}
    errors = []
    checked_urls = []
    concurrent = 70
    sem = Semaphore()
    # Unique marker pushed onto the queue to tell a worker to exit.
    stop = object()

    def doWork():
        """Worker loop: take stories off the queue until the stop marker."""
        while True:
            item = q.get()
            if item is stop:
                q.task_done()
                return
            story_id, story = item
            try:
                url = story['url']
                checked_urls.append(url)
                status, data = getStatus(url)
                if status == 200:
                    tryAddPost(story_id, data)
            except Exception as e:
                # Thread boundary: a failing story must not kill the worker,
                # otherwise q.join() below would never return.
                print('ERR: story {} failed: {!r}'.format(story_id, e))
            finally:
                q.task_done()

    def getStatus(ourl):
        """Return ``(status, body)`` for *ourl*, or ``("error", None)``."""
        conn = None
        try:
            url = urlparse(ourl)
            if url.scheme == 'https':
                conn = httplib.HTTPSConnection(url.netloc)
            elif url.scheme == 'http':
                conn = httplib.HTTPConnection(url.netloc)
            else:
                raise Exception('Incorrect protocol: {}'.format(url.scheme))
            # The request target must be non-empty and keep the query string.
            target = url.path or '/'
            if url.query:
                target = '{}?{}'.format(target, url.query)
            conn.request("GET", target)
            res = conn.getresponse()
            return res.status, res.read()
        except Exception as e:
            print(e, ourl)
            errors.append(ourl)
            return "error", None
        finally:
            if conn is not None:
                conn.close()

    def tryAddPost(story_id, data):
        """Extract the article text from *data* and write it to disk."""
        try:
            text = PyQuery(Document(data).summary()).text()
        except (Unparseable, LxmlError):
            print('{} has incorrect xml'.format(story_id))
            return
        if not text:
            print('ERR: Could not get the post({}) text'.format(story_id))
            return
        # Writes are serialized so only one thread touches the disk at a time.
        with sem:
            with open('posts/{}.txt'.format(story_id), 'w') as f:
                f.write(text)

    # Both the writers and the final count need the output directory.
    os.makedirs('posts', exist_ok=True)

    print('Started at {}'.format(time.strftime('%X %x')))
    q = Queue(concurrent * 2)
    for _ in range(concurrent):
        t = Thread(target=doWork)
        t.daemon = True
        t.start()
    try:
        for story_id, story in post_urllist.items():
            q.put((story_id, story))
        q.join()
    except KeyboardInterrupt:
        # Workers are daemon threads, so they die with the interpreter.
        print('Keyboard interrupt')
    else:
        # The queue is empty after join(), so these puts cannot block.
        for _ in range(concurrent):
            q.put(stop)
    print('Finished at {}'.format(time.strftime('%X %x')))
    num_posts = len(os.listdir('posts'))
    print('Got {} posts'.format(num_posts))
    return posts, errors, checked_urls
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment