Created
November 14, 2020 13:27
-
-
Save theotheo/810a3701689cdf2587e7ac4ca1a347e7 to your computer and use it in GitHub Desktop.
Грязный скрипт для скрейпинга теста Постнауки про социологов: https://postnauka.ru/tests/155870
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# %% | |
import time | |
import json | |
from selenium import webdriver | |
from selenium.webdriver.common.by import By | |
from selenium.webdriver.common.action_chains import ActionChains | |
from selenium.webdriver.support import expected_conditions | |
from selenium.webdriver.support.wait import WebDriverWait | |
from selenium.webdriver.common.keys import Keys | |
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities | |
from collections import defaultdict | |
import json | |
# %% | |
def n_answers(n):
    """Return how many answer options question number ``n`` has.

    The counts are hard-coded per question number; the scraping loop calls
    this with the numbers 1 through 10.

    Args:
        n: Question number.

    Returns:
        The number of answer options for question ``n``, or ``None`` when
        ``n`` is not one of the listed question numbers.
    """
    # Keys are groups of question numbers sharing the same option count.
    answers_numbers = {
        (2, 3, 4, 7): 5,
        (1, 5, 6, 8, 10): 6,
        (9,): 7,
    }
    for questions, number in answers_numbers.items():
        if n in questions:
            return number
    # Unknown question number: keep the historical "no value" result.
    return None
def get_inc():
    """Return the first character shown next to the resource-change marker.

    Looks up the ``.test__resource-change`` element through the module-level
    ``driver``, steps up to its parent element and returns the first
    character of the parent's text. The scraping loop stores this value
    under the ``'letter'`` key of each observation.

    Raises:
        IndexError: If the parent element's text is empty.
    """
    inc = driver.find_element(By.CSS_SELECTOR, '.test__resource-change')
    # XPath '..' selects the parent of the located element.
    return inc.find_element(By.XPATH, '..').text[0]
# Accumulates every observation that is also appended to the test file.
normalized = []

# %%
driver = webdriver.Chrome()
try:
    # Both files are opened in append mode so repeated runs add to earlier
    # output; the ``with`` block guarantees they are flushed and closed.
    with open('postnauka-test.jsonlines', 'a') as test_file, \
            open('postnauka-results.jsonlines', 'a') as result_file:
        # The whole sweep over answer positions is repeated three times.
        for _ in range(3):
            # n_answer is the 0-based answer position to pick in every
            # question of this pass (clamped per question below).
            for n_answer in range(0, 7):
                print(n_answer)
                driver.get("https://postnauka.ru/tests/155870")
                driver.set_window_size(1533, 845)
                driver.execute_script("window.scrollTo(0,136.25)")
                driver.find_element(By.CSS_SELECTOR, ".start-test").click()

                for n_question in range(1, 11):
                    max_answer = n_answers(n_question)
                    question = driver.find_element(
                        By.CSS_SELECTOR, '.question-content')
                    answers = driver.find_elements(By.CSS_SELECTOR, '.answer')

                    # Clamp into a separate local: overwriting n_answer here
                    # would make the clamp stick for all later questions, so
                    # the last options of longer questions were never picked.
                    answer_index = min(n_answer, max_answer - 1)
                    answer = answers[answer_index]
                    answer.click()

                    obs = {
                        'letter': get_inc(),
                        'answer': answer.text,
                        'question': question.text,
                    }
                    normalized.append(obs)
                    test_file.write(json.dumps(obs) + '\n')

                    driver.find_element(
                        By.CSS_SELECTOR, ".next-question").click()
                    # Fixed pause before touching the page again.
                    time.sleep(0.5)

                # After the tenth question, record the final result block.
                res = driver.find_element(
                    By.CSS_SELECTOR, ".test__resource").text
                text = driver.find_element(
                    By.CSS_SELECTOR, ".test_bottom-finish").text
                result_file.write(json.dumps({res: text}) + '\n')
                print({res: text})
finally:
    # Close the browser even when a lookup or click fails mid-run.
    driver.quit()
# %% | |
import pandas as pd

# NOTE(review): the scraping cell writes 'postnauka-test.jsonlines' relative
# to the working directory, while this cell reads it from ~/Downloads --
# confirm the file is moved there (or align the two paths) before running.
df = pd.read_json('~/Downloads/postnauka-test.jsonlines', lines=True)

# Keep only the first line of each answer's text; expand=False yields a
# Series, which is what a single-column assignment expects.
df['answer'] = df['answer'].str.extract('^(.*)\n?', expand=False)

# Repeated scraping passes produce identical rows; keep one of each.
df = df.drop_duplicates()

# One row per question, one column per letter.
# NOTE(review): pivot raises ValueError if a (question, letter) pair still
# maps to more than one distinct answer after de-duplication.
df = df.pivot(index='question', columns='letter')
df.to_excel('postnauka.xlsx')
# %% |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment