Skip to content

Instantly share code, notes, and snippets.

@gdamjan
Last active June 6, 2020 20:47
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gdamjan/dcfd923948d4d613ed2678bbbd38e4db to your computer and use it in GitHub Desktop.
Save gdamjan/dcfd923948d4d613ed2678bbbd38e4db to your computer and use it in GitHub Desktop.
import lxml.html
# Start URL for the crawl: the first listing page of category 82 on
# ustavensud.mk. NOTE(review): "odluki" presumably means "decisions" --
# confirm against the site.
odluki = 'http://ustavensud.mk/?cat=82'
def iterate_pages_from_category(start_page):
    """Yield the article links found on every page of a paginated category.

    Starting at *start_page*, each listing page is loaded and parsed with
    ``lxml.html.parse``. The ``href`` of every ``h3.gdlr-blog-title a``
    element is yielded, then the ``a.page-numbers.next`` link is followed
    until a page has no such link.

    Args:
        start_page: URL of the first listing page of the category.

    Yields:
        str: The target of each matched title link, resolved against the
        URL of the page it was found on, in document order, page by page.
    """
    from urllib.parse import urljoin

    page_url = start_page
    # Pages already crawled. If the pagination ever links back to one of
    # them, stop instead of looping forever.
    visited = set()
    while page_url not in visited:
        visited.add(page_url)
        root = lxml.html.parse(page_url).getroot()

        for link in root.cssselect('h3.gdlr-blog-title a'):
            href = link.get('href')
            if href is not None:
                # Resolve relative links so callers always get a usable URL.
                yield urljoin(page_url, href)

        next_links = root.cssselect('a.page-numbers.next')
        if not next_links:
            return
        next_href = next_links[0].get('href')
        if next_href is None:
            return
        page_url = urljoin(page_url, next_href)
from collections import Counter

# Run the whole crawl up front so every collected link can be counted.
pages = [*iterate_pages_from_category(odluki)]

# Print the links that were collected more than once (!?).
cnt = Counter()
cnt.update(pages)
print([link for link in cnt if cnt[link] > 1])
import lxml.html
import requests
# Start URL for the crawl: the first listing page of category 82 on
# ustavensud.mk. NOTE(review): "odluki" presumably means "decisions" --
# confirm against the site.
odluki = 'http://ustavensud.mk/?cat=82'
def iterate_pages_from_category(start_page):
    """Yield the article links found on every page of a paginated category.

    Starting at *start_page*, each listing page is downloaded through one
    shared ``requests.Session`` and parsed with ``lxml.html.fromstring``.
    The ``href`` of every ``h3.gdlr-blog-title a`` element is yielded, then
    the ``a.page-numbers.next`` link is followed until a page has no such
    link.

    Args:
        start_page: URL of the first listing page of the category.

    Yields:
        str: The target of each matched title link, resolved against the
        final URL of the page it was found on, in document order, page by
        page.

    Raises:
        requests.HTTPError: If a listing page answers with an error status.
        requests.RequestException: On connection failures or timeouts.
    """
    from urllib.parse import urljoin

    # Seconds to wait on the server before giving up; without a timeout a
    # stalled connection would block the crawl indefinitely.
    timeout = 30

    page_url = start_page
    # Pages already crawled. If the pagination ever links back to one of
    # them, stop instead of looping forever.
    visited = set()
    # The context manager closes the session's pooled connections when the
    # generator finishes or is closed early.
    with requests.Session() as session:
        while page_url not in visited:
            visited.add(page_url)
            resp = session.get(page_url, timeout=timeout)
            # Fail loudly rather than parsing an error page as a listing,
            # which would silently end the crawl with partial results.
            resp.raise_for_status()
            # resp.url is the address actually served (after redirects),
            # which is the correct base for resolving relative links.
            base_url = resp.url
            root = lxml.html.fromstring(resp.content, base_url=base_url)

            for link in root.cssselect('h3.gdlr-blog-title a'):
                href = link.get('href')
                if href is not None:
                    yield urljoin(base_url, href)

            next_links = root.cssselect('a.page-numbers.next')
            if not next_links:
                return
            next_href = next_links[0].get('href')
            if next_href is None:
                return
            page_url = urljoin(base_url, next_href)
from collections import Counter

# Run the whole crawl up front so every collected link can be counted.
pages = [*iterate_pages_from_category(odluki)]

# Print the links that were collected more than once (!?).
cnt = Counter()
cnt.update(pages)
print([link for link in cnt if cnt[link] > 1])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment