Created
May 6, 2020 05:49
-
-
Save satellitex/bc2fa1c53bc9b8f2a29e8f98fe339d6d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import re | |
import time | |
import sys | |
# Root of the target site; hrefs extracted from search results are appended to it.
BASE_URL = 'https://with.is'
# Endpoint that serves the paginated search results.
SEARCH_URL = BASE_URL + '/search'
# Pause, in seconds, taken by footprint() before each request.
SLEEP_TIME = 5
# Matches the opening <a ...> tag of each result link in the search HTML.
GET_URLS = re.compile(r'<a\s+class="link-area\s+needsclick\s+append-anchor"[^>]+>')
# Captures only the quoted href value. The previous greedy pattern
# 'href="(.+)">' ran to the end of the tag, so any attribute following href
# ended up inside the captured URL.
MATCH_URL = re.compile(r'href="([^"]+)"')
# Credentials are taken from the command line: <script> <token> <cookie>.
# Both positional arguments must be present before they are read (the old
# `<= 3` test read them when they could be missing and ignored them when
# extra arguments were given). Otherwise fall back to the placeholders below.
if len(sys.argv) >= 3:
    token = sys.argv[1]
    cookie = sys.argv[2]
else:
    token = "<your_token>"
    cookie = "s=<your_cookie_token>"

# Headers attached to every request made by search() and footprint().
headers = {
    'x-csrf-token': token,
    'x-requested-with': 'XMLHttpRequest',
    'cookie': cookie,
}
def search(page):
    """Fetch one page of search results and return its HTML.

    Waits one second, requests ``SEARCH_URL`` for the given page with the
    module-level ``headers``, and prints the URL and the HTTP status code.

    Args:
        page: Page number placed in the ``page`` query parameter.

    Returns:
        The response body as text.

    Raises:
        SystemExit: If the response status is not 200 (exit status 1).
    """
    time.sleep(1)  # short pause before every search request
    search_url = SEARCH_URL + '?page=' + str(page) + '&paging_order=0'
    # A timeout keeps a stalled connection from blocking the script forever.
    r = requests.get(search_url, headers=headers, timeout=30)
    print(search_url, flush=True)
    print(r.status_code, flush=True)
    if r.status_code != 200:
        # sys.exit instead of the interactive-only `exit` helper, and a
        # non-zero status so the shell can see that the run failed.
        sys.exit(1)
    return r.text
def get_urls(html):
    """Extract absolute link URLs from a search-results page.

    Args:
        html: HTML text to scan for anchor tags matched by ``GET_URLS``.

    Returns:
        A list of URLs, each built as ``BASE_URL`` plus the href captured by
        ``MATCH_URL``. The list is empty when no anchor tag matches.
    """
    urls = []
    for tag in GET_URLS.findall(html):
        href = MATCH_URL.search(tag)
        if href is None:
            # The tag has no href that MATCH_URL recognises; skip it instead
            # of failing with AttributeError on `None.group`.
            continue
        urls.append(BASE_URL + href.group(1))
    return urls
def footprint(urls):
    """Request every URL in turn, pausing ``SLEEP_TIME`` seconds before each.

    Each URL and its HTTP status code are printed. The response bodies are
    not used.

    Args:
        urls: Iterable of absolute URLs to request.

    Raises:
        SystemExit: If any response status is not 200 (exit status 1).
    """
    for url in urls:
        time.sleep(SLEEP_TIME)
        # A timeout keeps a stalled connection from blocking the script forever.
        r = requests.get(url, headers=headers, timeout=30)
        print(url, flush=True)
        print(r.status_code, flush=True)
        if r.status_code != 200:
            # sys.exit instead of the interactive-only `exit` helper, and a
            # non-zero status so the shell can see that the run failed.
            sys.exit(1)
# Sweep search pages 1 through 167 and request every link found on each page,
# then start over from page 1. The loop ends only when search() or
# footprint() exits the process.
while True:
    print('ReStart footprint', flush=True)
    for page in range(1, 168):
        html = search(page)
        urls = get_urls(html)
        footprint(urls)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment