@KunstDerFuge
Created July 26, 2021 01:24
Scrape 8chan Q threads from archive.is
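The script below drives a headless Firefox through Selenium (so Firefox and geckodriver must be installed), reads the list of thread numbers from qthreads.json, loads the newest archive.is snapshot of each 8chan thread, and writes one CSV of posts per thread into a data/ directory.
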
import pandas as pd
from selenium import webdriver
import json
import glob
from random import randrange
import time

with open("qthreads.json") as json_file:
    data = json.load(json_file)

# If you want to split the job, do something like this:
# data['8ch']['qresearch'] = data['8ch']['qresearch'][:569]
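
# A minimal sketch of the structure this script expects in qthreads.json,
# inferred from the loops in do_scrape() below. The board name comes from the
# comment above; the thread numbers are hypothetical placeholders:
#
#     {
#         "8ch": {"qresearch": [1234567, 1234890]},
#         "4ch": {},
#         "8kun": {}
#     }
#
# That is, data[platform][board] is a list of thread numbers; threads under
# '4ch' and '8kun' are skipped by do_scrape().
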
def do_scrape():
    # Launch a headless Firefox (Selenium 3-style API)
    options = webdriver.FirefoxOptions()
    options.set_headless()
    driver = webdriver.Firefox(firefox_options=options)
    for platform, boards in data.items():
        for board in boards:
            files = glob.glob(f'data/{board}*.csv')
            for thread in data[platform][board]:
                # Skip threads that already have a CSV on disk
                if f'data/{board}_{thread}.csv' in files:
                    print(f'Already scraped /{board}/{thread}...')
                    continue
                # Only 8chan threads are scraped here
                if platform == '4ch' or platform == '8kun':
                    continue
                try:
                    thread_scrape = pd.DataFrame(columns=['platform', 'board', 'thread_no', 'header', 'body'])
                    url = f"https://archive.is/newest/https://8ch.net/{board}/res/{thread}.html"
                    driver.get(url)
                    op = driver.find_element_by_css_selector('form > div:nth-of-type(1) > div:nth-of-type(2)')
                    comments = driver.find_elements_by_css_selector('form > div > div:nth-of-type(n+3)')
                    # OP
                    thread_scrape = thread_scrape.append({
                        'platform': '8chan',
                        'board': board,
                        'thread_no': thread,
                        'header': op.find_element_by_css_selector('div:nth-of-type(1)').get_attribute('innerHTML'),
                        'body': op.find_element_by_css_selector('div:nth-of-type(2)').get_attribute('innerHTML')
                    }, ignore_index=True)
                    # Comments
                    for comment in comments:
                        thread_scrape = thread_scrape.append({
                            'platform': '8chan',
                            'board': board,
                            'thread_no': thread,
                            'header': comment.find_element_by_css_selector('div:nth-of-type(1)').get_attribute('innerHTML'),
                            'body': comment.find_element_by_css_selector('div:nth-of-type(3)').get_attribute('innerHTML')
                        }, ignore_index=True)
                    print(f'Scraped {len(thread_scrape)} posts from {board}/{thread}...')
                    thread_scrape.to_csv(f'data/{board}_{thread}.csv')
                    print(f'Wrote {board}_{thread}.csv')
                    # Randomized delays between requests to avoid hammering archive.is
                    if randrange(6) == 0:
                        print('Randomly waiting 10 seconds...')
                        time.sleep(10)
                    else:
                        time.sleep(randrange(1, 4))
                except Exception as e:
                    print(f'Exception scraping {board}/{thread}...')
                    print(e)
                    try:
                        # Did we get a Captcha redirect?
                        captcha = driver.find_element_by_css_selector('h2 span:nth-of-type(1)')
                        if 'Please complete the security check' in captcha.text:
                            driver.quit()
                            print('Got Captcha redirect; restarting...')
                            time.sleep(20)
                            return False
                    except Exception:
                        # Not a Captcha page; move on to the next thread
                        pass
    # Clean up the browser once every thread has been visited
    driver.quit()
    return True
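
# do_scrape() returns False after quitting the browser on a Captcha redirect
# and True once every thread has been visited, so this loop keeps restarting
# the scrape with a fresh Firefox session until a full pass completes.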
done = False
while not done:
done = do_scrape()
time.sleep(1)