-
-
Save dav009/cea69eeddb3ec26c51bd to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
from scrapely import Scraper
# Training sample for the first page: its address and two text snippets,
# which are passed to add_sample() as the "p1" and "p2" annotations.
url = "http://www.molnlycke.com/solutions/infection-prevention-surgery/costs-microbial-infections/"
chunk1 = "More than 1.4 million people at any given time suffer from healthcare-associated infections"
chunk2 = "SSIs account for 14 percent of all"

# Training sample for the second page, annotated the same way.
url2 = "http://www.molnlycke.com/news-media/infection-prevention/barrier-scrub-suit-purple/"
chunk3 = "The BARRIER scrub suit family"
chunk4 = "We all know that when it comes to what we wear"
def add_sample(scraper, url, paragraph1, paragraph2):
    """Train *scraper* on *url* with two annotated paragraphs.

    The paragraphs are passed to ``scraper.train`` as the fields ``"p1"``
    and ``"p2"``, with the encoding fixed to ``"utf8"``.

    Args:
        scraper: Object exposing ``train(url, data, encoding=...)``.
        url: Address of the page used as the training example.
        paragraph1: Text snippet annotated as field ``"p1"``.
        paragraph2: Text snippet annotated as field ``"p2"``.
    """
    annotations = {"p1": paragraph1, "p2": paragraph2}
    scraper.train(url, annotations, encoding="utf8")
# Experiment 1: train on url first, then on url2.
s = Scraper()
add_sample(s, url, chunk1, chunk2)

# With only url trained, the author reports that scraping url works.
print("\nscraping url")
print(s.scrape(url))

add_sample(s, url2, chunk3, chunk4)

# After also training on url2, the author reports url still works...
print("\nscraping url")
print(s.scrape(url))

# ...but the result for url2 comes back empty (the open question of this
# script: "why is it empty?").
print("\nscraping url2")
print(s.scrape(url2))

print("\n-----training first on url2 and then url------")

# Experiment 2: a fresh scraper trained in the opposite order
# (url2 first, then url).
s = Scraper()
add_sample(s, url2, chunk3, chunk4)

# With only url2 trained, the author reports that scraping url2 works.
print("\nscraping url2")
print(s.scrape(url2))

add_sample(s, url, chunk1, chunk2)

# Now the author reports the result for url comes back empty, i.e. the page
# trained second is the one that fails in both experiments.
print("\nscraping url")
print(s.scrape(url))

# The page trained first (url2) is reported to still work.
print("\nscraping url2")
print(s.scrape(url2))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment