# KunstDerFuge/scrape_8chan_threads.py

## scrape_8chan_threads.py
import pandas as pd
from selenium import webdriver
import json
import glob
from random import randrange
import time


# Thread listing to scrape. do_scrape() walks it as
# data[platform][board] -> sequence of thread numbers.
# JSON is read explicitly as UTF-8 so the result does not depend on the
# platform's default text encoding.
with open("qthreads.json", encoding="utf-8") as json_file:
    data = json.load(json_file)
    # If you want to split the job, do something like this:
    # data['8ch']['qresearch'] = data['8ch']['qresearch'][:569]


def _is_captcha_page(driver):
    """Return True when the page currently loaded in *driver* is the Captcha check."""
    try:
        captcha = driver.find_element_by_css_selector('h2 span:nth-of-type(1)')
        return 'Please complete the security check' in captcha.text
    except Exception:
        # Best effort: if the heading cannot be located or read, treat the
        # page as "not a Captcha" so the caller moves on to the next thread.
        return False


def _scrape_thread(driver, board, thread):
    """Load the newest archive.is snapshot of one thread and return its posts.

    Returns a DataFrame with one row per post (opening post first) and the
    columns platform, board, thread_no, header, body.
    """
    url = f"https://archive.is/newest/https://8ch.net/{board}/res/{thread}.html"
    driver.get(url)
    op = driver.find_element_by_css_selector('form > div:nth-of-type(1) > div:nth-of-type(2)')
    comments = driver.find_elements_by_css_selector('form > div > div:nth-of-type(n+3)')

    # Pair each post element with the selector of its body: the opening post
    # keeps its body in the 2nd child div, replies keep theirs in the 3rd.
    posts = [(op, 'div:nth-of-type(2)')]
    posts.extend((comment, 'div:nth-of-type(3)') for comment in comments)

    # Collect plain dicts and build the frame once; appending to a DataFrame
    # row by row copies the whole frame each time and DataFrame.append no
    # longer exists in pandas 2.x.
    rows = [
        {
            'platform': '8chan',
            'board': board,
            'thread_no': thread,
            'header': post.find_element_by_css_selector('div:nth-of-type(1)').get_attribute('innerHTML'),
            'body': post.find_element_by_css_selector(body_selector).get_attribute('innerHTML'),
        }
        for post, body_selector in posts
    ]
    return pd.DataFrame(rows, columns=['platform', 'board', 'thread_no', 'header', 'body'])


def _scrape_pending_threads(driver):
    """Scrape every thread in ``data`` that has no CSV yet.

    Returns False as soon as a failed scrape turns out to be a Captcha page,
    True once the whole listing has been walked.
    """
    for platform, boards in data.items():
        for board, threads in boards.items():
            # A set makes the per-thread "already on disk?" test O(1).
            existing = set(glob.glob(f'data/{board}*.csv'))
            for thread in threads:
                if f'data/{board}_{thread}.csv' in existing:
                    print(f'Already scraped /{board}/{thread}...')
                    continue
                if platform == '4ch' or platform == '8kun':
                    continue

                try:
                    thread_scrape = _scrape_thread(driver, board, thread)
                    print('Scraped {} posts from {}/{}...'.format(len(thread_scrape), board, thread))
                    thread_scrape.to_csv('data/{}_{}.csv'.format(board, thread))
                    print('Wrote {}_{}.csv'.format(board, thread))

                    # Irregular pauses between requests: usually 1-3 s,
                    # one time in six a longer 10 s wait.
                    if randrange(6) == 0:
                        print('Randomly waiting 10 seconds...')
                        time.sleep(10)
                    else:
                        time.sleep(randrange(1, 4))

                except Exception as e:
                    print('Exception scraping {}/{}...'.format(board, thread))
                    print(e)
                    # Did we get a Captcha redirect?
                    if _is_captcha_page(driver):
                        return False
    return True


def do_scrape():
    """Scrape all not-yet-saved threads listed in ``data`` via archive.is.

    Starts a headless Firefox session, walks the module-level ``data``
    mapping and writes one ``data/{board}_{thread}.csv`` per scraped thread.
    Threads whose CSV already exists are skipped, as are the '4ch' and
    '8kun' platforms. A thread that fails for any reason other than a
    Captcha page is reported and skipped.

    Returns:
        bool: True when the whole listing has been walked; False when a
        Captcha page was hit, after waiting 20 seconds. Calling again
        resumes the job because finished threads are skipped.
    """
    options = webdriver.FirefoxOptions()
    options.set_headless()
    driver = webdriver.Firefox(firefox_options=options)

    try:
        finished = _scrape_pending_threads(driver)
    finally:
        # Release the browser on every exit path (normal completion, Captcha,
        # or an unexpected error) so repeated runs do not pile up Firefox
        # processes.
        driver.quit()

    if not finished:
        print('Got Captcha redirect; restarting...')
        time.sleep(20)
    return finished

# Keep launching scrape passes until one reports that the whole listing has
# been walked; each pass (including the last) is followed by a 1 s pause.
while True:
    done = do_scrape()
    time.sleep(1)
    if done:
        break
	import pandas as pd
	from selenium import webdriver
	import json
	import glob
	from random import randrange
	import time


	# NOTE(review): this region repeats the data load found earlier in the
	# file, with its indentation flattened; consider keeping only one copy.
	# Thread listing to scrape. do_scrape() walks it as
	# data[platform][board] -> sequence of thread numbers.
	# JSON is read explicitly as UTF-8 so the result does not depend on the
	# platform's default text encoding.
	with open("qthreads.json", encoding="utf-8") as json_file:
		data = json.load(json_file)
		# If you want to split the job, do something like this:
		# data['8ch']['qresearch'] = data['8ch']['qresearch'][:569]


	# NOTE(review): this region repeats the do_scrape() definition found
	# earlier in the file, with its indentation flattened; consider keeping
	# only one copy.
	def _is_captcha_page(driver):
		"""Return True when the page currently loaded in *driver* is the Captcha check."""
		try:
			captcha = driver.find_element_by_css_selector('h2 span:nth-of-type(1)')
			return 'Please complete the security check' in captcha.text
		except Exception:
			# Best effort: if the heading cannot be located or read, treat the
			# page as "not a Captcha" so the caller moves on to the next thread.
			return False


	def _scrape_thread(driver, board, thread):
		"""Load the newest archive.is snapshot of one thread and return its posts.

		Returns a DataFrame with one row per post (opening post first) and the
		columns platform, board, thread_no, header, body.
		"""
		url = f"https://archive.is/newest/https://8ch.net/{board}/res/{thread}.html"
		driver.get(url)
		op = driver.find_element_by_css_selector('form > div:nth-of-type(1) > div:nth-of-type(2)')
		comments = driver.find_elements_by_css_selector('form > div > div:nth-of-type(n+3)')

		# Pair each post element with the selector of its body: the opening post
		# keeps its body in the 2nd child div, replies keep theirs in the 3rd.
		posts = [(op, 'div:nth-of-type(2)')]
		posts.extend((comment, 'div:nth-of-type(3)') for comment in comments)

		# Collect plain dicts and build the frame once; appending to a DataFrame
		# row by row copies the whole frame each time and DataFrame.append no
		# longer exists in pandas 2.x.
		rows = [
			{
				'platform': '8chan',
				'board': board,
				'thread_no': thread,
				'header': post.find_element_by_css_selector('div:nth-of-type(1)').get_attribute('innerHTML'),
				'body': post.find_element_by_css_selector(body_selector).get_attribute('innerHTML'),
			}
			for post, body_selector in posts
		]
		return pd.DataFrame(rows, columns=['platform', 'board', 'thread_no', 'header', 'body'])


	def _scrape_pending_threads(driver):
		"""Scrape every thread in ``data`` that has no CSV yet.

		Returns False as soon as a failed scrape turns out to be a Captcha page,
		True once the whole listing has been walked.
		"""
		for platform, boards in data.items():
			for board, threads in boards.items():
				# A set makes the per-thread "already on disk?" test O(1).
				existing = set(glob.glob(f'data/{board}*.csv'))
				for thread in threads:
					if f'data/{board}_{thread}.csv' in existing:
						print(f'Already scraped /{board}/{thread}...')
						continue
					if platform == '4ch' or platform == '8kun':
						continue

					try:
						thread_scrape = _scrape_thread(driver, board, thread)
						print('Scraped {} posts from {}/{}...'.format(len(thread_scrape), board, thread))
						thread_scrape.to_csv('data/{}_{}.csv'.format(board, thread))
						print('Wrote {}_{}.csv'.format(board, thread))

						# Irregular pauses between requests: usually 1-3 s,
						# one time in six a longer 10 s wait.
						if randrange(6) == 0:
							print('Randomly waiting 10 seconds...')
							time.sleep(10)
						else:
							time.sleep(randrange(1, 4))

					except Exception as e:
						print('Exception scraping {}/{}...'.format(board, thread))
						print(e)
						# Did we get a Captcha redirect?
						if _is_captcha_page(driver):
							return False
		return True


	def do_scrape():
		"""Scrape all not-yet-saved threads listed in ``data`` via archive.is.

		Starts a headless Firefox session, walks the module-level ``data``
		mapping and writes one ``data/{board}_{thread}.csv`` per scraped thread.
		Threads whose CSV already exists are skipped, as are the '4ch' and
		'8kun' platforms. A thread that fails for any reason other than a
		Captcha page is reported and skipped.

		Returns:
			bool: True when the whole listing has been walked; False when a
			Captcha page was hit, after waiting 20 seconds. Calling again
			resumes the job because finished threads are skipped.
		"""
		options = webdriver.FirefoxOptions()
		options.set_headless()
		driver = webdriver.Firefox(firefox_options=options)

		try:
			finished = _scrape_pending_threads(driver)
		finally:
			# Release the browser on every exit path (normal completion, Captcha,
			# or an unexpected error) so repeated runs do not pile up Firefox
			# processes.
			driver.quit()

		if not finished:
			print('Got Captcha redirect; restarting...')
			time.sleep(20)
		return finished

	# Keep launching scrape passes until one reports that the whole listing
	# has been walked; each pass (including the last) is followed by a 1 s
	# pause. The loop body is re-indented so the statement parses.
	done = False

	while not done:
		done = do_scrape()
		time.sleep(1)