Skip to content

Instantly share code, notes, and snippets.

@ApprenticeGC
Last active March 16, 2021 03:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ApprenticeGC/908c1672f43ac46277fbaa6816406793 to your computer and use it in GitHub Desktop.
Save ApprenticeGC/908c1672f43ac46277fbaa6816406793 to your computer and use it in GitHub Desktop.
Extract board data by BeautifulSoup
import requests
import re
from bs4 import BeautifulSoup
from bs4 import CData
from bs4.element import CData
from tinydb import TinyDB, where
from datetime import datetime as dt
# params = {'130_1': ''}
# r = requests.get('https://sokoban.info/', params)
# soup = BeautifulSoup(html_doc, 'html.parser')
# soup = BeautifulSoup(r.text, 'html.parser')
def use_db():
    """Scrape a batch of not-yet-stored boards into the ``data.json`` TinyDB.

    On the first run (no ``Overview`` document flagged ``extracted``) the
    stage list returned by ``extract_stage()`` is stored together with a
    timestamp. Afterwards every stage gets its own table, named after the
    stage title, holding one ``{'index', 'board'}`` document per level.

    At most 20 boards are downloaded per call; levels whose ``index`` is
    already present in the stage table are skipped, so repeated calls resume
    where the previous one stopped.
    """
    print("use db")
    can_process_amount = 20
    current_process_amount = 0

    # The context manager closes the storage file even when a request or a
    # parsing step raises half-way through the batch.
    with TinyDB("data.json") as db:
        overview_table = db.table('Overview')
        # `where(...) == True` builds a TinyDB query object, so the explicit
        # comparison is required here (it is not a plain boolean test).
        no_item_in = overview_table.count(where('extracted') == True) == 0  # noqa: E712
        if no_item_in:
            print('has no data')
            overview_table.insert({
                "dateTime": dt.now().strftime("%Y/%m/%d %H:%M:%S"),
                "extracted": True,
                "stages": extract_stage(),
            })
        else:
            print('has data, can just proceed')

        stages = overview_table.all()[0]['stages']
        for stage in stages:
            # Quota used up: nothing left to do for the remaining stages.
            if current_process_amount >= can_process_amount:
                break

            stage_id = stage['index']
            stage_amount = int(stage['amount'])
            stage_table = db.table(stage['title'])

            processed = [doc['index'] for doc in stage_table.all()]
            for level in processed:
                print("processed stage")
                print(level)
            for level in processed:
                print("popping")
                print(level)

            # Levels are numbered 1..amount; keep the ones not stored yet.
            done = set(processed)
            pending = [
                level for level in range(1, stage_amount + 1)
                if level not in done
            ]

            for level in pending:
                if current_process_amount >= can_process_amount:
                    break
                # The site is queried with a bare "<stage>_<level>" key.
                params = {f"{stage_id}_{level}": ''}
                r = requests.get('https://sokoban.info/', params)
                page = BeautifulSoup(r.text, 'html.parser')
                # NOTE(review): an empty result (page without a board) is
                # stored as well and will not be retried -- confirm intended.
                stage_table.insert({
                    'index': level,
                    'board': extract_board(page),
                })
                current_process_amount += 1
def extract_stage(soup=None):
    """Return the stage (level collection) list found in ``<option>`` tags.

    Args:
        soup: Parsed page to read the ``<option>`` elements from. When
            omitted, the page is downloaded from https://sokoban.info/.
            NOTE(review): the default request mirrors the commented-out
            ``{'130_1': ''}`` query near the imports -- confirm that this
            page carries the full collection list.

    Returns:
        A list of ``{'index', 'title', 'amount'}`` dicts: ``index`` is the
        option's ``value`` attribute, ``amount`` the integer found in the
        parenthesised count of the option text, and ``title`` the text with
        that count suffix removed. Options without text or without a
        ``(N)`` count are skipped.
    """
    print("extract stage")
    if soup is None:
        response = requests.get('https://sokoban.info/', {'130_1': ''})
        soup = BeautifulSoup(response.text, 'html.parser')

    stages = []
    for option in soup.find_all("option"):
        index = option['value']
        content = option.string
        if content is None:
            # Option without a single text node: nothing to parse.
            continue
        count = re.search(r"\(([0-9]+)\)", content)
        if count is None:
            # No "(N)" count in the text, so this is not a stage entry.
            continue
        stages.append({
            'index': index,
            # Title and count are separated by three whitespace characters.
            'title': re.sub(r"\s{3}\([0-9]*\)", '', content),
            'amount': int(count.group(1)),
        })
    return stages
def extract_board(soup):
    """Extract the board rows from the page's ``var Board = "..."`` script.

    Every ``<script>`` whose text contains ``var Board`` is scanned line by
    line; for each line that starts with the assignment, the double-quoted
    literal is split on ``!`` and the pieces are collected as rows.

    Args:
        soup: Parsed page offering ``find_all`` (a BeautifulSoup document).

    Returns:
        ``{'rows': [...]}`` when at least one board assignment was found,
        otherwise an empty dict. A literal ending in ``!`` yields a trailing
        empty row, which is kept.
    """
    # Capture only what is inside the quotes, so whitespace around ``=`` and
    # anything after the closing quote never leaks into the rows.
    board_assignment = re.compile(r'\s*var Board\s*=\s*"([^"]*)"')

    board_result = {}
    rows = []
    for script in soup.find_all("script", string=re.compile("var Board")):
        for line in script.string.split('\n'):
            found = board_assignment.match(line)
            if found is None:
                continue
            rows.extend(found.group(1).split('!'))
            board_result = {
                'rows': rows
            }
    return board_result
# Script entry point: run one scraping pass when the file is executed directly.
if __name__ == "__main__":
    use_db()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment