Created
July 26, 2021 01:24
-
-
Save KunstDerFuge/66321f7942bb910dee948309bfa169a2 to your computer and use it in GitHub Desktop.
Scrape 8chan Q threads from archive.is
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import glob
import json
import os
import time
from random import randrange

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
# Load the thread index. do_scrape() walks it as
# {platform: {board: [thread_no, ...]}}.
# JSON is UTF-8 by specification, so decode it explicitly instead of relying
# on the platform's default locale encoding.
with open("qthreads.json", encoding="utf-8") as json_file:
    data = json.load(json_file)

# If you want to split the job, do something like this:
# data['8ch']['qresearch'] = data['8ch']['qresearch'][:569]
def _scrape_thread(driver, board, thread):
    """Load one archived 8ch thread and return its posts as a DataFrame.

    The frame has one row per post (the OP first, then every reply) with the
    columns ``platform``, ``board``, ``thread_no``, ``header`` and ``body``;
    ``header`` and ``body`` hold the raw ``innerHTML`` of the matching nodes.

    Raises whatever Selenium raises when the page cannot be loaded or the
    expected post markup is missing (for example on a Captcha page).
    """
    url = f"https://archive.is/newest/https://8ch.net/{board}/res/{thread}.html"
    driver.get(url)
    op = driver.find_element(By.CSS_SELECTOR, 'form > div:nth-of-type(1) > div:nth-of-type(2)')
    comments = driver.find_elements(By.CSS_SELECTOR, 'form > div > div:nth-of-type(n+3)')

    def inner_html(post, selector):
        return post.find_element(By.CSS_SELECTOR, selector).get_attribute('innerHTML')

    def make_row(post, body_selector):
        return {
            'platform': '8chan',
            'board': board,
            'thread_no': thread,
            'header': inner_html(post, 'div:nth-of-type(1)'),
            'body': inner_html(post, body_selector),
        }

    # The OP keeps its body in the second <div>, replies in the third.
    rows = [make_row(op, 'div:nth-of-type(2)')]
    rows.extend(make_row(comment, 'div:nth-of-type(3)') for comment in comments)

    # Build the frame once; appending row by row is quadratic and
    # DataFrame.append no longer exists in pandas 2.x.
    return pd.DataFrame(rows, columns=['platform', 'board', 'thread_no', 'header', 'body'])


def _is_captcha_page(driver):
    """Return True if the page currently loaded in *driver* is the Captcha check."""
    try:
        spans = driver.find_elements(By.CSS_SELECTOR, 'h2 span:nth-of-type(1)')
        # Only the first match is inspected, like a single find_element would.
        return bool(spans) and 'Please complete the security check' in spans[0].text
    except Exception as e:
        # Best effort: a failed probe is treated as "not a Captcha" so the
        # run carries on with the next thread, but the failure is reported.
        print('Could not check for Captcha: {}'.format(e))
        return False


def do_scrape():
    """Scrape every not-yet-saved 8ch thread listed in ``data`` to ``data/*.csv``.

    Threads whose CSV already exists are skipped, as are all threads of the
    ``4ch`` and ``8kun`` platforms. Each scraped thread is written to
    ``data/{board}_{thread}.csv``, followed by a short randomised pause.

    Returns:
        True once every thread has been visited; False as soon as a Captcha
        page is detected, so the caller can start a fresh pass.
    """
    options = webdriver.FirefoxOptions()
    options.add_argument('-headless')
    driver = webdriver.Firefox(options=options)
    try:
        # to_csv does not create missing directories.
        os.makedirs('data', exist_ok=True)
        for platform, boards in data.items():
            for board in boards:
                for thread in boards[board]:
                    out_path = f'data/{board}_{thread}.csv'
                    if os.path.exists(out_path):
                        print(f'Already scraped /{board}/{thread}...')
                        continue
                    if platform in ('4ch', '8kun'):
                        continue
                    try:
                        thread_scrape = _scrape_thread(driver, board, thread)
                        print('Scraped {} posts from {}/{}...'.format(len(thread_scrape), board, thread))
                        thread_scrape.to_csv(out_path)
                        print('Wrote {}_{}.csv'.format(board, thread))
                        # Throttle: usually 1-3 seconds, with a one-in-six
                        # chance of a longer 10 second pause.
                        if randrange(6) == 0:
                            print('Randomly waiting 10 seconds...')
                            time.sleep(10)
                        else:
                            time.sleep(randrange(1, 4))
                    except Exception as e:
                        print('Exception scraping {}/{}...'.format(board, thread))
                        print(e)
                        # Did we get a Captcha redirect?
                        if _is_captcha_page(driver):
                            print('Got Captcha redirect; restarting...')
                            time.sleep(20)
                            return False
        return True
    finally:
        # Always shut the browser down, on success as well as on a Captcha
        # restart or an unexpected error, so no Firefox process is left behind.
        driver.quit()
# Keep starting scrape passes until one of them reports completion.
# do_scrape() returns False when it bailed out early and needs a restart;
# there is a one-second pause after every pass, including the last one.
while True:
    done = do_scrape()
    time.sleep(1)
    if done:
        break
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment