Created
July 16, 2020 07:42
-
-
Save weaming/a43daf67a8a40c511253d0886b8ba460 to your computer and use it in GitHub Desktop.
携程爬虫 (Ctrip crawler)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# Author : weaming | |
# Mail : garden.yuen@gmail.com | |
# Created : 2020-07-15 14:05:23 | |
import os
import pdb
from typing import Iterator, Optional
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

from data_process.io_csv import write_csv
DEBUG = os.getenv("DEBUG") | |
UA = ( | |
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 " | |
"(KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36" | |
) | |
def http_get_url(
    url,
    is_json=False,
    headers=None,
    params=None,
    data=None,
    encoding="utf8",
    browser=True,
):
    """Fetch *url* and return an ``(err, payload)`` pair.

    err: ``False`` on HTTP 200, otherwise the non-200 status code, or an
         exception message string when decoding the body fails.
    payload: parsed JSON when *is_json* is true, else the response text.

    A POST request is issued when *data* is given, a GET otherwise.
    When *browser* is true a desktop-browser User-Agent is added.
    """
    if browser:
        # Copy before mutating so the caller's dict is never changed.
        headers = dict(headers or {})
        headers["User-Agent"] = UA
    if data:
        # Timeout added to match the GET branch; without it a stalled
        # server would hang the crawler forever.
        res = requests.post(
            url, params=params, headers=headers, data=data, timeout=30
        )
    else:
        res = requests.get(url, params=params, headers=headers, timeout=30)
    if encoding:
        # Force the declared encoding instead of requests' guess.
        res.encoding = encoding
    err = False if res.status_code == 200 else res.status_code
    try:
        # Separate local name: the original rebound the `data` parameter here.
        payload = res.json() if is_json else res.text
    except Exception as e:  # e.g. body is not valid JSON
        return str(e), res.text
    return err, payload
def url2soup(url, params=None, encoding="utf8"):
    """Download *url* and return its HTML parsed as a BeautifulSoup tree.

    Raises Exception when the request fails (non-200 status or a decode
    error reported by http_get_url).
    """
    print(url)
    request_headers = {"Referer": url, "User-Agent": UA}
    err, html = http_get_url(
        url,
        params=params,
        is_json=False,
        headers=request_headers,
        encoding=encoding,
    )
    if not err:
        return html2soup(html)
    raise Exception(err)
def html2soup(html):
    """Parse an HTML string using the stdlib-backed 'html.parser' backend."""
    return BeautifulSoup(html, "html.parser")
def join_link(host, current, href):
    """Resolve a scraped *href* to an absolute URL.

    host: scheme+netloc of the site (kept for interface compatibility;
          urljoin derives the origin from *current*).
    current: URL of the page the link was found on.
    href: absolute ('http...'), relative ('./...'), or rooted ('/...').

    Fix: the old './' branch concatenated paths by hand and produced
    broken URLs whenever *current* ended in a file name, e.g.
    '.../page.html' + './next.html' -> '.../page.htmlnext.html'.
    RFC 3986 resolution via urljoin handles every case correctly.
    """
    from urllib.parse import urljoin  # stdlib; local import keeps block self-contained

    if href.startswith('http'):
        # Already absolute.
        return href
    return urljoin(current, href)
class Crawler:
    """Base class for paginated site crawlers.

    Subclasses implement data_of_page() (records on one page) and
    next_page() (link to the following page, or None when done).
    """

    def __init__(self, name, url):
        self.name = name  # human-readable crawler name
        self.url = url    # first page to fetch

    def next_page(self, soup) -> Optional[str]:
        """Return the absolute URL of the next page, or None when finished."""
        raise NotImplementedError

    def data_of_page(self, soup) -> dict:
        """Yield one record dict per item found in *soup*."""
        raise NotImplementedError

    def data(self):
        """Generator over every record of every page, following pagination."""
        print(self.url)
        page_url = self.url
        while True:
            soup = url2soup(page_url)
            yield from self.data_of_page(soup)
            page_url = self.next_page(soup)
            if page_url is None:
                break
class Ctrip(Crawler):
    """Crawler for Ctrip sight-list pages (you.ctrip.com)."""

    host = 'https://you.ctrip.com'

    @staticmethod
    def _first_text(tags) -> str:
        """Stripped text of the first matched tag; '' when nothing matched."""
        return tags[0].text.strip() if tags else ''

    def data_of_page(self, soup) -> Iterator[dict]:
        """Yield one record per sight entry ('div.list_mod2') on the page.

        Fix: removed a leftover `pdb.set_trace()` inside a dead
        `except IndexError` — the empty-result guard means no IndexError
        can occur, and the trap would have dropped a production run into
        the debugger (then yielded undefined names).
        """
        for x in soup.select('div.list_mod2'):
            yield dict(
                name=self._first_text(x.select('a[title]')),
                score=self._first_text(x.select('a strong')),
                addr=self._first_text(x.select('dd.ellipsis')),
                rank=self._first_text(x.select('dt s')),
                price=self._first_text(x.select('.price')),
                comment=self._first_text(x.select('.bottomcomment')),
            )

    def next_page(self, soup) -> Optional[str]:
        """Absolute URL of the next page, or None when no '.nextpage' link.

        IndexError: no '.nextpage' element; KeyError: element lacks 'href'.
        """
        try:
            uri = soup.select('.nextpage')[0]['href']
        except (IndexError, KeyError) as e:
            print(e)
            return None
        return join_link(self.host, self.url, uri)
if __name__ == '__main__':
    # Crawl the Jiujiang sight list and dump every record to a CSV file.
    crawler = Ctrip('ctrip', 'https://you.ctrip.com/sightlist/jiujiang877.html')
    collected = []
    for record in crawler.data():
        print(record['name'])
        collected.append(record)
    write_csv(collected, './九江.csv')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment