-
-
Save ApprenticeGC/908c1672f43ac46277fbaa6816406793 to your computer and use it in GitHub Desktop.
Extract board data by BeautifulSoup
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import re | |
from bs4 import BeautifulSoup | |
from bs4 import CData | |
from bs4.element import CData | |
from tinydb import TinyDB, where | |
from datetime import datetime as dt | |
# Example of fetching and parsing a single level page; the query key has the
# form '<stage id>_<level number>':
# params = {'130_1': ''}
# r = requests.get('https://sokoban.info/', params)
# soup = BeautifulSoup(html_doc, 'html.parser')
# soup = BeautifulSoup(r.text, 'html.parser')
def use_db(db_path="data.json", can_process_amount=20, timeout=30):
    """Download Sokoban boards from sokoban.info into a TinyDB file.

    On the first run the stage list is obtained from ``extract_stage()`` and
    stored in the ``Overview`` table. Every run then walks the stored stages,
    works out which level numbers have no document yet in the table named
    after the stage, and downloads at most ``can_process_amount`` of them.
    Running the script repeatedly therefore resumes where the last run
    stopped.

    Args:
        db_path: Path of the TinyDB JSON file.
        can_process_amount: Maximum number of level pages fetched per call.
        timeout: Timeout in seconds applied to every HTTP request.

    Raises:
        requests.RequestException: If a level page cannot be fetched or the
            server answers with an HTTP error status. Nothing is stored for
            that level, so it is retried on the next run.
    """
    print("use db")
    # The context manager closes the storage file even when a request fails.
    with TinyDB(db_path) as db:
        overview_table = db.table('Overview')
        # `== True` is required here: it builds a TinyDB query object.
        is_extracted = where('extracted') == True  # noqa: E712
        if overview_table.count(is_extracted) == 0:
            print('has no data')
            overview_table.insert({
                "dateTime": dt.now().strftime("%Y/%m/%d %H:%M:%S"),
                "extracted": True,
                "stages": extract_stage(),
            })
        else:
            print('has data, can just proceed')

        # Pick the document that actually carries the stage list instead of
        # whatever happens to be first in the table.
        document = overview_table.get(is_extracted)

        current_process_amount = 0
        for stage in document['stages']:
            if current_process_amount >= can_process_amount:
                break  # quota used up; later stages wait for the next run
            stage_id = stage['index']
            stage_table = db.table(stage['title'])
            # Level numbers that already have a stored board.
            done = {doc['index'] for doc in stage_table.all()}
            for level in range(1, int(stage['amount']) + 1):
                if level in done:
                    continue
                if current_process_amount >= can_process_amount:
                    break
                # The site selects a level through a bare query key of the
                # form '<stage id>_<level number>'.
                params = {str(stage_id) + '_' + str(level): ''}
                r = requests.get('https://sokoban.info/', params,
                                 timeout=timeout)
                # Do not record an error page as an (empty) board.
                r.raise_for_status()
                soup = BeautifulSoup(r.text, 'html.parser')
                stage_table.insert({
                    'index': level,
                    'board': extract_board(soup),
                })
                current_process_amount += 1
def extract_stage(soup=None):
    """Return the stage list found in the page's ``<option>`` elements.

    Each option is expected to have a ``value`` attribute (the stage id) and
    a label of the form ``"<title> (<number of levels>)"``. Options that do
    not follow this shape (no value, no text, no "(N)" part) are skipped.

    Args:
        soup: A parsed page offering ``find_all("option")``. When omitted,
            the sokoban.info front page is downloaded and parsed.

    Returns:
        A list of dicts with the keys ``'index'`` (the option's value),
        ``'title'`` (the label without the " (N)" part) and ``'amount'``
        (N as an int).

    Raises:
        requests.RequestException: If ``soup`` is omitted and the front page
            cannot be fetched.
    """
    print("extract stage")
    if soup is None:
        response = requests.get('https://sokoban.info/', timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

    amount_pattern = re.compile(r"\(([0-9]+)\)")
    title_suffix_pattern = re.compile(r" \([0-9]+\)")

    stages = []
    for option in soup.find_all("option"):
        index = option.get('value')
        label = option.string
        if index is None or not label:
            continue
        match = amount_pattern.search(label)
        if match is None:
            # Not a stage entry (no level count in the label).
            continue
        stages.append({
            'index': index,
            'title': title_suffix_pattern.sub('', label),
            'amount': int(match.group(1)),
        })
    return stages
def extract_board(soup):
    """Extract the board rows from the page's ``var Board = "..."`` script line.

    Looks through every ``<script>`` whose text contains ``var Board`` for
    lines that start (after optional whitespace) with ``var Board =``
    followed by a quoted string. The quoted text is split on ``'!'`` into
    rows; a final empty piece produced by a trailing ``'!'`` is dropped.
    Only the quoted text is used, so whitespace around the quotes, the
    trailing ``;`` and anything after it never end up in a row.

    Args:
        soup: A parsed page offering ``find_all("script", string=...)``.

    Returns:
        ``{'rows': [...]}`` with the rows of all matching lines in document
        order, or ``{}`` when no such line exists.
    """
    # Group 2 is the text between the (single or double) quotes.
    board_assignment = re.compile(r"""\s*var Board\s*=\s*(["'])(.*?)\1""")

    rows = []
    found = False
    for script in soup.find_all("script", string=re.compile("var Board")):
        for line in (script.string or '').splitlines():
            match = board_assignment.match(line)
            if match is None:
                continue
            found = True
            board_rows = match.group(2).split('!')
            # "row!row!" splits into ['row', 'row', '']: the last piece is
            # an artifact of the terminator, not a board row.
            if board_rows[-1] == '':
                board_rows.pop()
            rows.extend(board_rows)
    return {'rows': rows} if found else {}
# Entry point: start the scraper only when this file is run as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    use_db()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment