Ctrip crawler
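A small scraper for the Ctrip travel site (you.ctrip.com): it walks a city's sight listing page by page, extracts each sight's name, score, address, rank, price, and comment count, and writes the rows to a CSV file.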
#!/usr/bin/env python3
# Author : weaming
# Mail : garden.yuen@gmail.com
# Created : 2020-07-15 14:05:23
import os
import pdb
from typing import Iterator, Optional

import requests
from bs4 import BeautifulSoup

from data_process.io_csv import write_csv  # author's own CSV helper package

DEBUG = os.getenv("DEBUG")
UA = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"
)

def http_get_url(
    url,
    is_json=False,
    headers=None,
    params=None,
    data=None,
    encoding="utf8",
    browser=True,
):
    # GET the URL (or POST when `data` is given) and return an (err, payload)
    # pair instead of raising on HTTP errors.
    if browser:
        headers = headers or {}
        headers["User-Agent"] = UA
    if data:
        res = requests.post(url, params=params, headers=headers, data=data, timeout=30)
    else:
        res = requests.get(url, params=params, headers=headers, timeout=30)
    if encoding:
        res.encoding = encoding
    # err is False on HTTP 200, the numeric status code otherwise
    err = False if res.status_code == 200 else res.status_code
    try:
        data = res.json() if is_json else res.text
    except Exception as e:
        return str(e), res.text
    return err, data
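
# Usage sketch: callers unpack the (err, payload) pair and must check err
# before trusting the payload, e.g.
#   err, html = http_get_url('https://example.com')
#   if err:
#       raise Exception(err)
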
def url2soup(url, params=None, encoding="utf8"):
    # BeautifulSoup reminders:
    #   print(soup.prettify())
    #   soup.find_all('p', class_='chorus')
    #   soup.find_all(id='third')
    print(url)
    err, html = http_get_url(
        url,
        params=params,
        is_json=False,
        headers={"Referer": url, "User-Agent": UA},
        encoding=encoding,
    )
    if err:
        raise Exception(err)
    return html2soup(html)


def html2soup(html):
    return BeautifulSoup(html, "html.parser")

def join_link(host, current, href):
    # absolute link: https://...
    if href.startswith('http'):
        return href
    # relative link: ./...
    if href.startswith('.'):
        return current + href[(2 if current.endswith('/') else 1):]
    # root-relative link: /...
    return host + href
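
# The stdlib alternative is urllib.parse.urljoin(current, href), which covers
# the same three forms, though it resolves './' links by replacing the last
# path segment rather than appending to the current URL as above.
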
class Crawler:
    def __init__(self, name, url):
        self.name = name
        self.url = url

    def next_page(self, soup) -> Optional[str]:
        raise NotImplementedError

    def data_of_page(self, soup) -> Iterator[dict]:
        raise NotImplementedError

    def data(self):
        print(self.url)
        url = self.url
        while True:
            soup = url2soup(url)
            yield from self.data_of_page(soup)
            url = self.next_page(soup)
            if url is None:
                return
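
# Crawler is a small template class: subclasses implement data_of_page() to
# extract rows from one listing page and next_page() to locate the following
# page; data() then walks the whole listing lazily as a generator.
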
class Ctrip(Crawler):
    host = 'https://you.ctrip.com'

    def data_of_page(self, soup) -> Iterator[dict]:
        # returns '' instead of raising when a selector matches nothing
        get_text = lambda xs: xs[0].text.strip() if xs else ''
        for x in soup.select('div.list_mod2'):
            try:
                name = get_text(x.select('a[title]'))
                score = get_text(x.select('a strong'))
                addr = get_text(x.select('dd.ellipsis'))
                rank = get_text(x.select('dt s'))
                price = get_text(x.select('.price'))
                comment = get_text(x.select('.bottomcomment'))
            except IndexError:
                # drop into the debugger only when DEBUG is set; otherwise
                # skip the malformed entry instead of yielding unbound names
                if DEBUG:
                    pdb.set_trace()
                continue
            yield dict(
                name=name,
                score=score,
                addr=addr,
                rank=rank,
                price=price,
                comment=comment,
            )

    def next_page(self, soup) -> Optional[str]:
        try:
            uri = soup.select('.nextpage')[0]['href']
        except Exception as e:
            # no '.nextpage' element (or no href on it): last page reached
            print(e)
            return None
        return join_link(self.host, self.url, uri)
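
# The CSS selectors above match the you.ctrip.com sight-list markup as of
# mid-2020; if the site has since changed its HTML, get_text() will quietly
# return '' for the affected fields.
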

if __name__ == '__main__':
    c = Ctrip('ctrip', 'https://you.ctrip.com/sightlist/jiujiang877.html')
    rows = []
    for x in c.data():
        print(x['name'])
        rows.append(x)
    write_csv(rows, './九江.csv')  # 九江 = Jiujiang
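
The data_process.io_csv import at the top is the author's own helper package. If it is not available, a minimal stand-in for write_csv built on the standard library could look like the sketch below, assuming the helper simply writes a list of uniform dicts to a CSV file with a header row:

import csv

def write_csv(rows, path):
    # assumes every row is a dict sharing the keys of the first row
    if not rows:
        return
    with open(path, 'w', newline='', encoding='utf8') as f:
        writer = csv.DictWriter(f, fieldnames=list(rows[0]))
        writer.writeheader()
        writer.writerows(rows)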