Last active
January 14, 2023 17:04
-
-
Save bitsnaps/654315ebbbb4bbe55b6fe5d0ccc4ab7a to your computer and use it in GitHub Desktop.
Scraping results from the Quiz And Survey Master WordPress plugin using Python 3
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os, csv, requests, datetime, urllib, re | |
from bs4 import BeautifulSoup | |
# from dateutil.parser import parse # if you want to parse datetime | |
# This script allows you to grab results from your wordpress website if you're using "Quiz And Survey" plugin.
wp_host = 'https://your-wordpress-website.com'
wp_login = f"{wp_host}/login"
wp_admin = f"{wp_host}/wp-admin/"
wp_quiz = f"{wp_host}/wp-admin/admin.php?page=mlw_quiz_results&qmn_order_by=time_taken_real"
username = 'YOUR_USER_NAME'
password = 'YOUR_PASSWORD'
# Quiz selection and CSV header layout.
# BUG FIX: the original 'fields' line contained placeholder prose inside the
# list literal ("... (COMPETE THIS ARRAY ...)"), which is a syntax error.
quiz_params = {
    'title': 'NAME OF THE QUIZ',
    # Complete this list with the fields you want exported as CSV columns.
    'fields': ['Quiz Name', 'Name', 'Email', 'Phone', 'Answers'],
}
def save_csv(filename, data, fields=None):
    """Append one row to a ';'-delimited CSV file.

    A header row is written first when the file does not exist yet.

    Args:
        filename: path of the CSV file to create or append to.
        data: sequence of cell values for one result row.
        fields: header column names; defaults to quiz_params['fields']
            (kept as a global fallback for backward compatibility).
    """
    write_mode = 'a' if os.path.exists(filename) else 'w'  # make a new file if not
    # newline='' is required by the csv module (it writes its own '\r\n'
    # terminators); explicit utf-8 keeps output platform-independent.
    with open(filename, mode=write_mode, newline='', encoding='utf-8') as csv_file:
        data_writer = csv.writer(csv_file, delimiter=';', quotechar='"')
        if write_mode == 'w':
            data_writer.writerow(quiz_params['fields'] if fields is None else fields)
        data_writer.writerow(data)
def parse_url(session, url, title, result_file, datetime_result):
    """Fetch one quiz-result detail page and append its data to result_file.

    Args:
        session: authenticated requests.Session.
        url: absolute URL of the single-result admin page.
        title: quiz title, used as the first CSV column.
        result_file: path of the CSV file passed through to save_csv.
        datetime_result: timestamp string (or datetime) stored as the last column.
    """
    resp = session.get(url)
    if resp.status_code == 200:
        data = [title]
        soup = BeautifulSoup(resp.content, features="html.parser")
        # Overview section: one candidate field (name, email, ...) per span.
        overview = soup.find_all(class_='overview-main-wrapper')[0]
        div = overview.find_all(id='submitdiv')[0]
        for span in div.find_all(class_='result-candidate-span'):
            data.append(span.find('span').text.strip())
        # Comments section (hidden in the page markup).
        comments = soup.find_all(class_='comment-inner-wrap')
        if comments:
            # Reuse the list fetched above instead of querying the soup again.
            div = comments[0].find_all(id='submitdiv')[0]
            comment_content = div.find_all(class_='inside')[0]
            data.append(comment_content.text.strip())
        # Responses section: alternating question/answer entries, joined into
        # a single ';'-separated cell.
        responses_wrap = soup.find_all(class_='response-inner-wrap')[0]
        survey = []
        for response in responses_wrap.find_all(class_='qmn_question_answer'):
            r = response.find_all(class_='mlw_qmn_question')[0]
            survey.append(r.find_all('b')[0].text.strip())     # question
            survey.append(r.find_all('span')[0].text.strip())  # answer
        data.append(';'.join(survey))
        data.append(datetime_result)
        save_csv(result_file, data)
    else:
        # BUG FIX: the original concatenated str + int ('...'+resp.status_code),
        # which raises TypeError on the error path.
        print(f'Error: status code: {resp.status_code}')
def parse_page(soup, result_file, session=None):
    """Walk one results-table page and scrape every row's detail page.

    Args:
        soup: BeautifulSoup of a quiz-results listing page.
        result_file: CSV path each row's data is appended to.
        session: authenticated requests.Session; falls back to the
            module-level session `s` for backward compatibility.
    """
    tbody_list = soup.find_all('tbody', id='the-list')[0]  # <tbody> of the results table
    trs = tbody_list.find_all('tr')
    print(f"Nbr of rows in the page: {len(trs)}")
    for tr in trs:
        rows = tr.find_all('td')  # rows[0] is the checkbox column
        td_quiz_name = rows[1]  # Quiz Name
        # Default to "now" when the row carries no timestamp <abbr>.
        datetime_result = datetime.datetime.now()
        user = rows[8]  # User/DateTime taken
        if user.find('abbr'):
            datetime_result = user.find('abbr')['title']
        datetime_taken = rows[9]  # User/DateTime taken
        if datetime_taken.find('abbr'):
            datetime_result = datetime_taken.find('abbr')['title']
        title = td_quiz_name.find_all('span')[0].text
        show_link = td_quiz_name.find_all('a', href=True)[0]
        url = urllib.parse.urljoin(wp_admin, show_link['href'])
        # Title filtering is already done server-side via the
        # qsm_search_phrase query parameter, so every row is parsed.
        parse_url(session if session is not None else s, url, title, result_file, datetime_result)
def query_parse_page(session, url_page, num_page):
    """Fetch one paginated results page and scrape it into its own CSV file."""
    timestamp = datetime.datetime.now().strftime('%d_%m_%Y-%H_%M_%S')
    result_file = f"results_{timestamp}_page{num_page + 1}.csv"
    resp = session.get(url_page)
    if resp.status_code != 200:
        print('Cannot find page: ', url_page)
        return
    page_soup = BeautifulSoup(resp.content, features="html.parser")
    parse_page(page_soup, result_file)
with requests.Session() as s:
    # Authenticate; WordPress keeps the session alive via cookies.
    s.post(wp_login, data={'log': username, 'pwd': password})
    # Restrict results to the configured quiz (server-side search filter).
    quiz_title_query = urllib.parse.urlencode({'qsm_search_phrase': quiz_params['title']})
    wp_quiz = f"{wp_quiz}&{quiz_title_query}"
    resp = s.get(wp_quiz)
    soup = BeautifulSoup(resp.content, features="html.parser")
    title = soup.find_all('h2')[0]
    # Explicit check instead of `assert` (asserts are stripped under -O);
    # failure here usually means the login did not succeed.
    if 'Quiz Results' not in title.text.strip():
        raise SystemExit('Unexpected page content — login probably failed')
    # 'div.tablenav-pages' > 'span.displaying-num' => e.g. '209 results'
    num_results = soup.find_all('span', class_='displaying-num')[0].text.strip()
    print("Nbr of results: ", num_results)
    # 'div.tablenav-pages' > 'span.paging-input' => e.g. '1 of 6'
    paging_num = soup.find_all('span', class_='paging-input')
    # The first page is already loaded, so parse it before paginating.
    result_file = 'results_' + datetime.datetime.now().strftime('%d_%m_%Y-%H_%M_%S') + '.csv'
    parse_page(soup, result_file)
    if len(paging_num) > 0:  # the page may have no pagination
        paging_num = paging_num[0].text.strip()
        # BUG FIX: the original pattern r'(\d) of (\d)' only matched
        # single-digit page counts; \d+ also handles 10+ pages.
        regx_paging_num = re.match(r'(\d+) of (\d+)', paging_num)
        nbr_pages = -1
        if regx_paging_num:
            nbr_pages = regx_paging_num.group(2)  # get '6' out of '1 of 6'
        print('Nbr of pages: ', nbr_pages)
        # qsm_results_page is 0-indexed and page 0 is the one parsed above.
        # BUG FIX: the original range(0, nbr_pages - 1) re-fetched page 0
        # and skipped the last page; fetch pages 1 .. nbr_pages-1 instead.
        for num_page in range(1, int(nbr_pages)):
            url_page = f"{wp_quiz}&qsm_results_page={num_page}"
            query_parse_page(s, url_page, num_page)
    else:
        print('Cannot find pagination, paging num: ', paging_num)
print('done.')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment