Skip to content

Instantly share code, notes, and snippets.

@KunstDerFuge
Last active July 24, 2021 04:14
Show Gist options
  • Save KunstDerFuge/9b01ce6736f3f2a1a118a8de044415ea to your computer and use it in GitHub Desktop.
Save KunstDerFuge/9b01ce6736f3f2a1a118a8de044415ea to your computer and use it in GitHub Desktop.
import scrapy
import json
class ChanSpider(scrapy.Spider):
    """Scrape posts from archive.is snapshots of 8ch threads.

    Thread numbers are loaded from a JSON file laid out as a mapping of
    platform -> board -> iterable of thread numbers. Every thread whose
    platform is not in ``_SKIPPED_PLATFORMS`` is turned into an
    ``archive.is/newest/`` URL pointing at the 8ch.net thread page, and each
    ``div.post`` found on the fetched pages is emitted as one item.
    """

    name = 'chanspider'

    # One request at a time with a 5 second delay between downloads.
    custom_settings = {
        'CONCURRENT_REQUESTS': 1,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 5,
        'COOKIES_ENABLED': True,
        'COOKIES_DEBUG': True,
    }

    # Platforms present in the thread list that this spider does not crawl.
    _SKIPPED_PLATFORMS = frozenset({'4ch', '8kun'})

    def __init__(self, *args, threads_file="qthreads.json", **kwargs):
        """Build ``start_urls`` from the thread list stored in *threads_file*.

        Args:
            threads_file: Path of the JSON thread list. Defaults to
                ``qthreads.json`` in the current working directory.

        Raises:
            OSError: If *threads_file* cannot be opened.
            json.JSONDecodeError: If *threads_file* is not valid JSON.
        """
        super().__init__(*args, **kwargs)
        with open(threads_file, encoding="utf-8") as json_file:
            data = json.load(json_file)

        # NOTE(review): the bare front page is requested first, presumably to
        # obtain cookies before the thread pages are fetched -- confirm.
        self.start_urls = ['https://archive.is']
        for platform, boards in data.items():
            if platform in self._SKIPPED_PLATFORMS:
                continue
            for board, threads in boards.items():
                for thread in threads:
                    self.start_urls.append(
                        f"https://archive.is/newest/https://8ch.net/{board}/res/{thread}.html"
                    )
        self.logger.debug("start_urls: %s", self.start_urls)

    def start_requests(self):
        """Yield one request per start URL, sent with browser-like headers."""
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive',
            'Cookie': 'tmr_reqNum=115; tmr_lvid=882819a95acb02353a36c79e21127f41; tmr_lvidTS=1626648079850; _ga=GA1.2.661111166.1626650423; _ym_uid=1626650435843066809; _ym_d=1626650435; _ga=GA1.2.661111166.1627098268; tmr_detect=0%7C1627096756374',
            'Host': 'archive.is',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-User': '?1',
            'TE': 'trailers',
            # Header values are text on the wire; use a string, not an int.
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:90.0) Gecko/20100101 Firefox/90.0',
        }
        for url in self.start_urls:
            yield scrapy.Request(url=url, callback=self.parse, headers=headers)

    def parse(self, response):
        """Yield one dict per ``div.post`` element in *response*.

        The board and thread number are taken from the request URL, which is
        expected to end in ``.../<board>/res/<thread>.html``.
        NOTE(review): if the request URL no longer ends in the original thread
        URL (for example after a redirect), ``board`` and ``thread_no`` will
        not be meaningful -- confirm against live responses.
        """
        url_parts = response.request.url.split('/')
        board = url_parts[-3]
        thread_no = url_parts[-1].split('.')[0]

        for post in response.css('div.post'):
            post_no = post.css('a.post_no:nth-of-type(3)::text').get()
            if post_no is None:
                # No third post-number link on this post; fall back to the
                # thread number (presumably the opening post -- confirm).
                post_no = thread_no
            yield {
                'name': post.css('span.name::text').get(),
                'subject': post.css('span.subject::text').get(),
                # .get() yields None instead of raising KeyError when a post
                # has no <time> element or no datetime attribute, so a single
                # malformed post does not abort the rest of the page.
                'timestamp': post.css('time').attrib.get('datetime'),
                'poster_id': post.css('span.poster_id::text').get(),
                'board': board,
                'thread_no': thread_no,
                'post_no': post_no,
                'tripcode': post.css('span.trip::text').get(),
                'body_text': post.css('div.body').get(),
            }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment