Skip to content

Instantly share code, notes, and snippets.

@vik-y
Last active November 14, 2022 12:32
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save vik-y/b3b55534c9d65140f0c47d14c305e905 to your computer and use it in GitHub Desktop.
Save vik-y/b3b55534c9d65140f0c47d14c305e905 to your computer and use it in GitHub Desktop.
Autoscraper Example
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
from autoscraper import AutoScraper
from typing import List
# Page whose FAQ entries are extracted.
url = 'https://www.mca.gov.in/MinistryV2/efiling.html'

# Create the scraper and load its rules from the file named 'mca'
# (a relative name, so presumably resolved against the current working
# directory -- verify when running from elsewhere).
scraper = AutoScraper()
scraper.load('mca')

# With group_by_alias=True the result is indexed by alias below; the loaded
# rules are expected to define the 'questions' and 'answers' aliases.
data = scraper.get_result_similar(url, group_by_alias=True)
questions, answers = data['questions'], data['answers']
class QnA:
    """A single FAQ entry pairing a question with its answer."""

    def __init__(self, question, answer):
        """Store the question and answer of one FAQ entry.

        Args:
            question: The question text.
            answer: The answer belonging to that question.
        """
        self.question = question
        self.answer = answer

    def __str__(self):
        """Return a ``Q: ...`` line followed by a ``>> A: ...`` line."""
        return f'Q: {self.question}\n>> A: {self.answer}\n'

    def __repr__(self):
        """Return the same text as ``__str__``.

        This keeps entries readable when a list of them is printed.
        """
        return self.__str__()
# Pair each scraped question with the answer at the same position.
# zip() stops at the shorter of the two lists, so a page that yields fewer
# answers than questions no longer aborts the script with an IndexError.
faqs: List[QnA] = [
    QnA(question, answer) for question, answer in zip(questions, answers)
]

# Write every entry to faqs.txt, each followed by a 50-dash separator line.
# The encoding is set explicitly so non-ASCII scraped text is written the
# same way regardless of the platform's default encoding.
with open('faqs.txt', 'w', encoding='utf-8') as f:
    for faq in faqs:
        f.write(f'{faq}\n' + '-' * 50 + '\n')
{"stack_list": [{"content": [["[document]", {"style": "", "class": ""}, 0], ["html", {"class": ["no-js"], "style": ""}, 0], ["body", {"style": "", "class": ""}, 0], ["div", {"class": ["widthcommon", "center"], "style": ""}, 0], ["div", {"class": ["center"], "style": ""}, 0], ["div", {"class": ["rightInnerContent", "floatRight"], "style": ""}, 0], ["div", {"class": ["faq", "floatLeft"], "style": ""}, 0], ["ul", {"style": "", "class": ""}, 1], ["li", {"style": "", "class": ""}, 0], ["h1", {"class": ["floatLeft"], "style": ""}]], "wanted_attr": null, "is_full_url": null, "is_non_rec_text": null, "url": "", "hash": "a854af3a468bd34ba08f3a4618906b06ec8f67041d3310e436d463ab9ee6dfb0", "stack_id": "rule_roo8", "alias": "questions"}, {"content": [["[document]", {"style": "", "class": ""}, 0], ["html", {"class": ["no-js"], "style": ""}, 0], ["body", {"style": "", "class": ""}, 0], ["div", {"class": ["widthcommon", "center"], "style": ""}, 0], ["div", {"class": ["center"], "style": ""}, 0], ["div", {"class": ["rightInnerContent", "floatRight"], "style": ""}, 0], ["div", {"class": ["faq", "floatLeft"], "style": ""}, 0], ["ul", {"style": "", "class": ""}, 1], ["li", {"style": "", "class": ""}, 0], ["div", {"style": "display: none;", "class": ""}]], "wanted_attr": null, "is_full_url": null, "is_non_rec_text": null, "url": "", "hash": "10c1534cfca1945249c417b6cc9842baeb766c0a5afd2719a0cfb66e0e03717a", "stack_id": "rule_q5wq", "alias": "answers"}]}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment