Skip to content

Instantly share code, notes, and snippets.

@vik-y
Last active November 14, 2022 12:32
Show Gist options
  • Save vik-y/b3b55534c9d65140f0c47d14c305e905 to your computer and use it in GitHub Desktop.
Save vik-y/b3b55534c9d65140f0c47d14c305e905 to your computer and use it in GitHub Desktop.
Autoscraper Example
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"from autoscraper import AutoScraper\n",
"\n",
"scraper = AutoScraper()\n",
"\n",
"url = \"https://www.mca.gov.in/MinistryV2/efiling.html\"\n",
"\n",
"wanted_list = ['Can I apply for a company name online?', 'Yes, you can avail the RUN service at MCA portal for reserving a name online']\n",
"\n",
"\n",
"result = scraper.build(url, wanted_list)\n",
"\n",
"rules = scraper.get_result_similar(url, grouped=True)\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"scraper.set_rule_aliases({'rule_roo8': 'questions', 'rule_q5wq': 'answers'})\n",
"scraper.keep_rules(['rule_roo8', 'rule_q5wq'])\n",
"\n",
"scraper.save('mca')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.10.8 64-bit",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.8"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
from autoscraper import AutoScraper
from typing import List
# Restore the scraping rules that were previously saved under the name 'mca'.
scraper = AutoScraper()
scraper.load('mca')

# Page the saved rules are applied to.
url = 'https://www.mca.gov.in/MinistryV2/efiling.html'

# Request the results grouped by rule alias; the saved rules carry the
# aliases 'questions' and 'answers', which are the keys read below.
data = scraper.get_result_similar(url, group_by_alias=True)
questions = data['questions']
answers = data['answers']
class QnA:
    """One FAQ entry: a question together with its answer."""

    def __init__(self, question, answer):
        # Both values are stored as given; no validation or conversion.
        self.question, self.answer = question, answer

    def __str__(self):
        # Two-line rendering: a "Q:" line, then a ">> A:" line, each
        # terminated by a newline.
        return f'Q: {self.question}\n>> A: {self.answer}\n'

    def __repr__(self):
        # The repr deliberately reuses the human-readable form.
        return self.__str__()
# Pair each scraped question with the answer at the same position.
# zip() stops at the shorter of the two lists, so a count mismatch between
# the scraped questions and answers no longer raises IndexError part-way
# through the loop.
faqs: List[QnA] = [
    QnA(question, answer) for question, answer in zip(questions, answers)
]

# Write to file. The encoding is pinned so that non-ASCII text from the
# scraped page is written the same way regardless of the platform's default
# encoding.
separator = '-' * 50
with open('faqs.txt', 'w', encoding='utf-8') as f:
    for faq in faqs:
        # Each entry is followed by a newline and a 50-dash separator line.
        f.write(f'{faq}\n{separator}\n')
{"stack_list": [{"content": [["[document]", {"style": "", "class": ""}, 0], ["html", {"class": ["no-js"], "style": ""}, 0], ["body", {"style": "", "class": ""}, 0], ["div", {"class": ["widthcommon", "center"], "style": ""}, 0], ["div", {"class": ["center"], "style": ""}, 0], ["div", {"class": ["rightInnerContent", "floatRight"], "style": ""}, 0], ["div", {"class": ["faq", "floatLeft"], "style": ""}, 0], ["ul", {"style": "", "class": ""}, 1], ["li", {"style": "", "class": ""}, 0], ["h1", {"class": ["floatLeft"], "style": ""}]], "wanted_attr": null, "is_full_url": null, "is_non_rec_text": null, "url": "", "hash": "a854af3a468bd34ba08f3a4618906b06ec8f67041d3310e436d463ab9ee6dfb0", "stack_id": "rule_roo8", "alias": "questions"}, {"content": [["[document]", {"style": "", "class": ""}, 0], ["html", {"class": ["no-js"], "style": ""}, 0], ["body", {"style": "", "class": ""}, 0], ["div", {"class": ["widthcommon", "center"], "style": ""}, 0], ["div", {"class": ["center"], "style": ""}, 0], ["div", {"class": ["rightInnerContent", "floatRight"], "style": ""}, 0], ["div", {"class": ["faq", "floatLeft"], "style": ""}, 0], ["ul", {"style": "", "class": ""}, 1], ["li", {"style": "", "class": ""}, 0], ["div", {"style": "display: none;", "class": ""}]], "wanted_attr": null, "is_full_url": null, "is_non_rec_text": null, "url": "", "hash": "10c1534cfca1945249c417b6cc9842baeb766c0a5afd2719a0cfb66e0e03717a", "stack_id": "rule_q5wq", "alias": "answers"}]}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment