ApprenticeGC/main.py Secret

## main.py
import requests

import re
from bs4 import BeautifulSoup
from bs4 import CData
from bs4.element import CData

from tinydb import TinyDB, where
from datetime import datetime as dt

# params = {'130_1': ''}
# r = requests.get('https://sokoban.info/', params)

# soup = BeautifulSoup(html_doc, 'html.parser')
# soup = BeautifulSoup(r.text, 'html.parser')

def use_db():
    """Download the next batch of Sokoban boards into the local TinyDB file.

    The ``Overview`` table of ``data.json`` caches the stage list produced by
    ``extract_stage()``; the list is fetched only when the table holds no
    document flagged ``extracted``.  Every stage gets its own table (named
    after the stage title) with one ``{'index', 'board'}`` document per
    downloaded level.  A single call issues at most 20 requests for levels
    that are not stored yet, so repeated runs fill in the remaining levels.
    """
    print("use db")

    can_process_amount = 20  # upper bound on HTTP requests per run
    current_process_amount = 0

    # The context manager closes the storage even when a request or the
    # parsing raises half-way through the batch.
    with TinyDB("data.json") as db:
        overview_table = db.table('Overview')

        # `== True` is required here: it builds a TinyDB query object, it is
        # not an ordinary boolean comparison.
        if overview_table.count(where('extracted') == True) == 0:
            print('has no data')
            overview_table.insert({
                "dateTime": dt.now().strftime("%Y/%m/%d %H:%M:%S"),
                "extracted": True,
                "stages": extract_stage()
            })
        else:
            print('has data, can just proceed')

        stages = overview_table.all()[0]['stages']
        for stage in stages:
            if current_process_amount >= can_process_amount:
                break  # quota used up, nothing left to do in this run

            stage_id = stage['index']
            stage_table = db.table(stage['title'])

            # Levels are numbered 1..amount; skip the ones already stored.
            already_stored = {doc['index'] for doc in stage_table.all()}
            pending = [
                level
                for level in range(1, int(stage['amount']) + 1)
                if level not in already_stored
            ]

            for to_process in pending:
                if current_process_amount >= can_process_amount:
                    break
                # Count the attempt up front so failing requests cannot make
                # a run exceed its quota.
                current_process_amount += 1

                # The site selects a level through a bare "<stage>_<level>"
                # query key with an empty value.
                combined = '{}_{}'.format(stage_id, to_process)
                params = {combined: ''}
                try:
                    r = requests.get('https://sokoban.info/', params,
                                     timeout=30)
                    r.raise_for_status()
                except requests.RequestException as err:
                    # Leave the level unstored so a later run retries it
                    # instead of persisting an error page as a board.
                    print('request failed for {}: {}'.format(combined, err))
                    continue

                soup = BeautifulSoup(r.text, 'html.parser')
                stage_table.insert({
                    'index': to_process,
                    'board': extract_board(soup)
                })


def extract_stage(soup=None):
    """Return the stage catalogue read from the page's ``<option>`` elements.

    Args:
        soup: Parsed page (BeautifulSoup-like object offering ``find_all``)
            whose ``<option>`` elements describe the stages.  When omitted,
            ``https://sokoban.info/`` is downloaded and parsed.
            NOTE(review): assumes the landing page contains the stage
            selector -- confirm against the live site.

    Returns:
        A list of ``{'index', 'title', 'amount'}`` dicts: ``index`` is the
        option's ``value`` attribute, ``title`` the option text without its
        ``(N)`` suffix and ``amount`` the integer ``N``.  Options without
        text or without a ``(N)`` count are skipped.

    Raises:
        requests.RequestException: if the page has to be downloaded and the
            request fails or returns an error status.
    """
    print("extract stage")

    if soup is None:
        response = requests.get('https://sokoban.info/', timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

    # Matches the "(N)" level count together with the spacing in front of it.
    count_pattern = re.compile(r"\s*\(([0-9]+)\)")

    stages = []
    for o in soup.find_all("option"):
        content = o.string
        if content is None:
            continue  # option has nested markup or no text at all
        found = count_pattern.search(content)
        if found is None:
            continue  # e.g. a placeholder entry without a level count

        stages.append({
            'index': o['value'],
            'title': count_pattern.sub('', content),
            'amount': int(found.group(1))
        })

    return stages


def extract_board(soup):
    """Collect the board rows from the page's ``var Board = "...";`` script line.

    Every ``<script>`` whose text contains ``var Board`` is scanned line by
    line.  For each line starting with the declaration, the declaration
    prefix, the closing ``";`` and any remaining double quotes are removed and
    the rest is split on ``!``.  Whitespace between ``=`` and the opening
    quote is kept, and a trailing ``!`` produces a final empty row.

    Args:
        soup: Parsed page offering ``find_all`` (e.g. a BeautifulSoup object).

    Returns:
        ``{'rows': [...]}`` with the rows of all matching lines in document
        order, or ``{}`` when no declaration line was found.
    """
    collected_rows = []
    declaration_seen = False

    for script in soup.find_all("script", string=re.compile("var Board")):
        for line in script.string.split('\n'):
            if re.match(r"\s*var Board\s*=", line) is None:
                continue
            declaration_seen = True

            payload = re.sub(r"\s*var Board\s*=", '', line)
            payload = re.sub(r"\"\s*;", '', payload)
            payload = re.sub(r"\"", '', payload)
            # Splitting already drops every '!' separator.
            collected_rows.extend(payload.split('!'))

    if not declaration_seen:
        return {}
    return {'rows': collected_rows}


# Script entry point: call use_db() when this file is executed directly
# (not when it is imported as a module).
if __name__ == '__main__':
    use_db()
    # Earlier manual experiments, kept for reference:
    # stages = extract_stage()
    # for s in stages:
    #     print(s)
    # extract_board()

# See PyCharm help at https://www.jetbrains.com/help/pycharm/
	import requests

	import re
	from bs4 import BeautifulSoup
	from bs4 import CData
	from bs4.element import CData

	from tinydb import TinyDB, where
	from datetime import datetime as dt

	# params = {'130_1': ''}
	# r = requests.get('https://sokoban.info/', params)

	# soup = BeautifulSoup(html_doc, 'html.parser')
	# soup = BeautifulSoup(r.text, 'html.parser')

	def use_db():
	    """Download the next batch of Sokoban boards into the local TinyDB file.

	    The ``Overview`` table of ``data.json`` caches the stage list produced
	    by ``extract_stage()``; the list is fetched only when the table holds
	    no document flagged ``extracted``.  Every stage gets its own table
	    (named after the stage title) with one ``{'index', 'board'}`` document
	    per downloaded level.  A single call issues at most 20 requests for
	    levels that are not stored yet, so repeated runs fill in the rest.
	    """
	    print("use db")

	    can_process_amount = 20  # upper bound on HTTP requests per run
	    current_process_amount = 0

	    # The context manager closes the storage even when a request or the
	    # parsing raises half-way through the batch.
	    with TinyDB("data.json") as db:
	        overview_table = db.table('Overview')

	        # `== True` is required here: it builds a TinyDB query object, it
	        # is not an ordinary boolean comparison.
	        if overview_table.count(where('extracted') == True) == 0:
	            print('has no data')
	            overview_table.insert({
	                "dateTime": dt.now().strftime("%Y/%m/%d %H:%M:%S"),
	                "extracted": True,
	                "stages": extract_stage()
	            })
	        else:
	            print('has data, can just proceed')

	        stages = overview_table.all()[0]['stages']
	        for stage in stages:
	            if current_process_amount >= can_process_amount:
	                break  # quota used up, nothing left to do in this run

	            stage_id = stage['index']
	            stage_table = db.table(stage['title'])

	            # Levels are numbered 1..amount; skip the ones already stored.
	            already_stored = {doc['index'] for doc in stage_table.all()}
	            pending = [
	                level
	                for level in range(1, int(stage['amount']) + 1)
	                if level not in already_stored
	            ]

	            for to_process in pending:
	                if current_process_amount >= can_process_amount:
	                    break
	                # Count the attempt up front so failing requests cannot
	                # make a run exceed its quota.
	                current_process_amount += 1

	                # The site selects a level through a bare
	                # "<stage>_<level>" query key with an empty value.
	                combined = '{}_{}'.format(stage_id, to_process)
	                params = {combined: ''}
	                try:
	                    r = requests.get('https://sokoban.info/', params,
	                                     timeout=30)
	                    r.raise_for_status()
	                except requests.RequestException as err:
	                    # Leave the level unstored so a later run retries it
	                    # instead of persisting an error page as a board.
	                    print('request failed for {}: {}'.format(combined, err))
	                    continue

	                soup = BeautifulSoup(r.text, 'html.parser')
	                stage_table.insert({
	                    'index': to_process,
	                    'board': extract_board(soup)
	                })


	def extract_stage(soup=None):
	    """Return the stage catalogue read from the page's ``<option>`` elements.

	    Args:
	        soup: Parsed page (BeautifulSoup-like object offering ``find_all``)
	            whose ``<option>`` elements describe the stages.  When
	            omitted, ``https://sokoban.info/`` is downloaded and parsed.
	            NOTE(review): assumes the landing page contains the stage
	            selector -- confirm against the live site.

	    Returns:
	        A list of ``{'index', 'title', 'amount'}`` dicts: ``index`` is the
	        option's ``value`` attribute, ``title`` the option text without
	        its ``(N)`` suffix and ``amount`` the integer ``N``.  Options
	        without text or without a ``(N)`` count are skipped.

	    Raises:
	        requests.RequestException: if the page has to be downloaded and
	            the request fails or returns an error status.
	    """
	    print("extract stage")

	    if soup is None:
	        response = requests.get('https://sokoban.info/', timeout=30)
	        response.raise_for_status()
	        soup = BeautifulSoup(response.text, 'html.parser')

	    # Matches the "(N)" level count together with the spacing before it.
	    count_pattern = re.compile(r"\s*\(([0-9]+)\)")

	    stages = []
	    for o in soup.find_all("option"):
	        content = o.string
	        if content is None:
	            continue  # option has nested markup or no text at all
	        found = count_pattern.search(content)
	        if found is None:
	            continue  # e.g. a placeholder entry without a level count

	        stages.append({
	            'index': o['value'],
	            'title': count_pattern.sub('', content),
	            'amount': int(found.group(1))
	        })

	    return stages


	def extract_board(soup):
	    """Collect the board rows from the page's ``var Board = "...";`` line.

	    Every ``<script>`` whose text contains ``var Board`` is scanned line by
	    line.  For each line starting with the declaration (any amount of
	    whitespace before ``var`` and around ``=``), the declaration prefix,
	    the closing ``";`` and any remaining double quotes are removed and the
	    rest is split on ``!``.  A trailing ``!`` produces a final empty row.

	    Args:
	        soup: Parsed page offering ``find_all`` (e.g. BeautifulSoup).

	    Returns:
	        ``{'rows': [...]}`` with the rows of all matching lines in
	        document order, or ``{}`` when no declaration line was found.
	    """
	    # `\s*` (not a single `\s`) so declarations at column 0 or with
	    # different spacing around `=` are recognised as well.
	    declaration = re.compile(r"\s*var Board\s*=")

	    rows = []
	    declaration_seen = False
	    for s in soup.find_all("script", string=re.compile("var Board")):
	        for sl in s.string.split('\n'):
	            if declaration.match(sl) is None:
	                continue
	            declaration_seen = True

	            payload = declaration.sub('', sl)
	            payload = re.sub(r"\"\s*;", '', payload)
	            payload = re.sub(r"\"", '', payload)
	            # Splitting already drops every '!' separator.
	            rows.extend(payload.split('!'))

	    if not declaration_seen:
	        return {}
	    return {'rows': rows}


	# Press the green button in the gutter to run the script.
	# Script entry point: call use_db() when this file is executed directly
	# (not when it is imported as a module).
	if __name__ == '__main__':
	    use_db()
	    # Earlier manual experiments, kept for reference:
	    # stages = extract_stage()
	    # for s in stages:
	    #     print(s)
	    # extract_board()

	# See PyCharm help at https://www.jetbrains.com/help/pycharm/