Skip to content

Instantly share code, notes, and snippets.

@satellitex
Created May 6, 2020 05:49
Show Gist options
  • Save satellitex/bc2fa1c53bc9b8f2a29e8f98fe339d6d to your computer and use it in GitHub Desktop.
Save satellitex/bc2fa1c53bc9b8f2a29e8f98fe339d6d to your computer and use it in GitHub Desktop.
import requests
import re
import time
import sys
# Site root; relative hrefs extracted from result pages are appended to it.
BASE_URL = 'https://with.is'
# Endpoint that returns the paginated search results.
SEARCH_URL = BASE_URL + '/search'
# Seconds to wait before each request to an extracted link.
SLEEP_TIME = 5
# Matches the opening <a ...> tag of every anchor carrying the exact class list
# "link-area needsclick append-anchor".
GET_URLS = re.compile(r'<a\s+class="link-area\s+needsclick\s+append-anchor"[^>]+>')
# Captures the href value of an anchor tag. The capture stops at the closing
# quote, so it neither swallows attributes that follow href nor requires href
# to be the last attribute before '>'.
MATCH_URL = re.compile(r'href="([^"]+)"')
def credentials_from_argv(argv):
    """Return the ``(token, cookie)`` pair to authenticate requests with.

    Args:
        argv: Command-line argument list in ``sys.argv`` layout, i.e. the
            program name followed by the user-supplied arguments.

    Returns:
        ``(argv[1], argv[2])`` when at least two arguments were supplied,
        otherwise the placeholder pair that must be edited by hand.
    """
    # The arguments are only safe to index when both are actually present;
    # the previous `<= 3` test raised IndexError when none were given.
    if len(argv) >= 3:
        return argv[1], argv[2]
    return "<your_token>", "s=<your_cookie_token>"


token, cookie = credentials_from_argv(sys.argv)

# Headers sent with every request: CSRF token, XMLHttpRequest marker and the
# session cookie.
headers = {
    'x-csrf-token': token,
    'x-requested-with': 'XMLHttpRequest',
    'cookie': cookie,
}
def search(page, timeout=30):
    """Fetch one page of search results and return its HTML.

    Sleeps for one second before the request, then prints the requested URL
    and the HTTP status code.

    Args:
        page: Page number placed in the ``page`` query parameter.
        timeout: Seconds to wait for the server before ``requests`` gives up
            and raises a timeout error.

    Returns:
        The response body as text.

    Raises:
        SystemExit: With exit status 1 when the response status is not 200.
    """
    time.sleep(1)
    search_url = SEARCH_URL + '?page=' + str(page) + '&paging_order=0'
    # Without a timeout a stalled connection would block the script forever.
    r = requests.get(search_url, headers=headers, timeout=timeout)
    print(search_url, flush=True)
    print(r.status_code, flush=True)
    if r.status_code != 200:
        # A rejected request is a failure, so report it with a non-zero exit
        # status; sys.exit is used because the bare `exit` helper is only
        # installed by the `site` module.
        sys.exit(1)
    return r.text
def get_urls(html):
    """Build absolute URLs from the anchor tags found in ``html``.

    Args:
        html: Page markup to scan with the ``GET_URLS`` pattern.

    Returns:
        A list with ``BASE_URL`` prepended to the href captured by
        ``MATCH_URL`` from each matched tag, in document order. The list is
        empty when no tag matches.
    """
    urls = []
    for tag in GET_URLS.findall(html):
        href = MATCH_URL.search(tag)
        if href is None:
            # A matched anchor without an extractable href cannot be visited;
            # skip it rather than fail with AttributeError on `.group`.
            continue
        urls.append(BASE_URL + href.group(1))
    return urls
def footprint(urls, timeout=30):
    """Request every URL in ``urls`` in order, pausing before each one.

    For each URL the function sleeps ``SLEEP_TIME`` seconds, issues a GET with
    the shared ``headers``, and prints the URL followed by the HTTP status
    code.

    Args:
        urls: Iterable of absolute URLs to request.
        timeout: Seconds to wait for the server before ``requests`` gives up
            and raises a timeout error.

    Raises:
        SystemExit: With exit status 1 as soon as a response status is not 200.
    """
    for url in urls:
        time.sleep(SLEEP_TIME)
        # Without a timeout a stalled connection would block the script forever.
        r = requests.get(url, headers=headers, timeout=timeout)
        print(url, flush=True)
        print(r.status_code, flush=True)
        if r.status_code != 200:
            # Signal the failure to the calling shell with a non-zero status;
            # sys.exit does not depend on the `site`-provided `exit` helper.
            sys.exit(1)
# Sweep search pages 1 through 167 and request every link found on each page,
# then start over from page 1. The loop only ends when search() or
# footprint() terminates the process.
while True:
    print('ReStart footpring', flush=True)
    for page in range(1, 168):
        footprint(get_urls(search(page)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment