Skip to content

Instantly share code, notes, and snippets.

@theotheo
Created November 14, 2020 13:27
Show Gist options
  • Save theotheo/810a3701689cdf2587e7ac4ca1a347e7 to your computer and use it in GitHub Desktop.
Save theotheo/810a3701689cdf2587e7ac4ca1a347e7 to your computer and use it in GitHub Desktop.
Грязный скрипт для скрейпинга теста Постнауки про социологов https://postnauka.ru/tests/155870
# %%
import time
import json
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from collections import defaultdict
import json
# %%
# Answer-option count for each question number known to this script.
# Built once at import time instead of on every call.
_ANSWER_COUNTS = {
    1: 6,
    2: 5,
    3: 5,
    4: 5,
    5: 6,
    6: 6,
    7: 5,
    8: 6,
    9: 7,
    10: 6,
}


def n_answers(n):
    """Return the number of answer options for question number *n*.

    Args:
        n: 1-based question number.

    Returns:
        The hard-coded option count for questions 1 through 10, or ``None``
        when *n* is not one of the known question numbers.
    """
    return _ANSWER_COUNTS.get(n)
def get_inc():
    """Return the first character of the text around `.test__resource-change`.

    Looks up the element matching the CSS selector `.test__resource-change`
    through the module-level ``driver``, steps up to its parent node and
    returns the first character of that parent's text.  The caller stores
    the value under the key ``'letter'``.

    Raises:
        IndexError: if the parent element's text is empty.
    """
    marker = driver.find_element(By.CSS_SELECTOR, '.test__resource-change')
    parent = marker.find_element(By.XPATH, '..')
    label = parent.text
    return label[0]
# Earlier in-memory accumulators, currently disabled:
# letter_answer = defaultdict(list)
# results = []
# In-memory list of observation dicts; the scraping loop appends one entry
# per answered question, alongside the line it writes to the test file.
normalized = []
# %%
# Scrape the test by taking it repeatedly.  Each attempt clicks the answer at
# the same position on every question (clamped to the number of options that
# question has), records what the click produced, and finally records the
# result shown after the tenth question.
driver = webdriver.Chrome()
try:
    # The context manager flushes and closes both logs even if a step of the
    # scrape raises part-way through.
    with open('postnauka-test.jsonlines', 'a') as test_file, \
            open('postnauka-results.jsonlines', 'a') as result_file:
        for _ in range(3):
            for n_answer in range(0, 7):
                print(n_answer)
                driver.get("https://postnauka.ru/tests/155870")
                driver.set_window_size(1533, 845)
                driver.execute_script("window.scrollTo(0,136.25)")
                driver.find_element(By.CSS_SELECTOR, ".start-test").click()
                for n_question in range(1, 11):
                    max_answer = n_answers(n_question)
                    question = driver.find_element(
                        By.CSS_SELECTOR, '.question-content')
                    answers = driver.find_elements(By.CSS_SELECTOR, '.answer')
                    # Clamp into a separate name.  Overwriting n_answer here
                    # would make the clamp stick for the rest of the attempt,
                    # so the last options of longer questions would never be
                    # clicked.
                    choice = min(n_answer, max_answer - 1)
                    answer = answers[choice]
                    answer.click()
                    # Values are read after the click, in this order.
                    obs = {
                        'letter': get_inc(),
                        'answer': answer.text,
                        'question': question.text,
                    }
                    normalized.append(obs)
                    test_file.write(json.dumps(obs) + '\n')
                    driver.find_element(
                        By.CSS_SELECTOR, ".next-question").click()
                    # Presumably gives the page time to show the next question.
                    time.sleep(0.5)
                # After the last question: read the result heading and text.
                res = driver.find_element(
                    By.CSS_SELECTOR, ".test__resource").text
                text = driver.find_element(
                    By.CSS_SELECTOR, ".test_bottom-finish").text
                result_file.write(json.dumps({res: text}) + '\n')
                print({res: text})
finally:
    # Always shut the browser down, including when the scrape fails.
    driver.quit()
# %%
import pandas as pd

# Turn the scraped observations into a question-by-letter table and export it.
# (An earlier variant trimmed the last three characters of each answer
# instead of using the regex below.)
df = (
    pd.read_json('~/Downloads/postnauka-test.jsonlines', lines=True)
    # Keep only the first line of every answer text: `.` does not match a
    # newline, so the capture group stops at the first line break.
    .assign(answer=lambda frame: frame['answer'].str.extract('^(.*)\n?'))
    # Repeated attempts log the same observation several times.
    .drop_duplicates()
    # One row per question, one column group per letter.
    .pivot(index='question', columns='letter')
)
df.to_excel('postnauka.xlsx')
# %%
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment