@devjoe
Created September 15, 2016 13:02
Sample code: From Python to Dragon - Building a Web Crawler, Continued from Part 1 @ Taichung Python Meetup
# -*- coding: utf-8 -*-
target_url = "https://www.khanacademy.org/"
# -------------------------------------------------------
import whois
#print whois.whois(target_url)
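# A minimal sketch of reading individual fields: whois.whois() returns a parsed record
# whose values are also exposed as attributes. Exact field names vary by registrar/TLD,
# so treat the ones below as assumptions.
#w = whois.whois(target_url)
#print w.domain_name, w.creation_date, w.registrar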
# -------------------------------------------------------
import requests
#resp = requests.get(target_url + "robots.txt")  # target_url already ends with "/"
#print resp.text
#import robotparser  # try it out and play with it
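# A minimal robotparser sketch, assuming the Python 2 module name
# (it lives in urllib.robotparser on Python 3):
#import robotparser
#rp = robotparser.RobotFileParser()
#rp.set_url(target_url + "robots.txt")
#rp.read()
#print rp.can_fetch("*", target_url + "math")  # is this path allowed for any user agent?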
# -------------------------------------------------------
import builtwith
import pprint; p=pprint.pprint; import pdb; pdb.set_trace();
print builtwith.parse(target_url)
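# builtwith.parse() returns a dict mapping technology categories (e.g. "web-servers",
# "javascript-frameworks") to lists of detected names; it is a best-effort guess from
# page markup and headers, so expect some misses.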
# -------------------------------------------------------
# curl vs. httpie
# !curl http://www.meetup.com/Taichung-Python-Meetup/events/227386858/
# !http http://www.meetup.com/Taichung-Python-Meetup/events/227386858/
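# Both commands fetch the same event page: curl prints the raw response, while
# httpie ("http") formats and colourises headers and bodies, which is handier for
# quick inspection during a demo.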
# -------------------------------------------------------
#from scrapely import Scraper
#s = Scraper()
#train_url = "http://pypi.python.org/pypi/w3lib/1.1"
#data = {'name': 'w3lib 1.1', 'author': 'Scrapy project', 'description': 'Library of web-related functions'}
#s.train(train_url, data)
#test_url = "http://pypi.python.org/pypi/parrot/0.0.9"
#print s.scrape(test_url)
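# scrapely learns an extraction template from one annotated example page (train) and
# then applies it to similarly structured pages (scrape), so no CSS/XPath selectors
# are written by hand.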
# -------------------------------------------------------
from pyquery import PyQuery as pq
target2_url = "https://www.khanacademy.org/"
#d = pq(url=target2_url)
#links = [ el.attrib.get("href", False) for el in d("a")]
#import pprint; p=pprint.pprint; import pdb; pdb.set_trace();
#print links
#print len(links)
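# A possible follow-up (sketch): resolve relative hrefs to absolute URLs before
# crawling further; urljoin is from Python 2's urlparse (urllib.parse in Python 3).
#from urlparse import urljoin
#abs_links = [urljoin(target2_url, href) for href in links if href]
#print abs_links[:10]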
# -------------------------------------------------------
from splinter import Browser
with Browser() as browser:
    browser.visit(target2_url)
    elements = browser.find_by_tag("a")
    links = [el["href"] for el in elements]
    import pprint; p=pprint.pprint; import pdb; pdb.set_trace();
    print links
    print len(links)
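# Note: Browser() with no arguments drives Firefox through Selenium by default; other
# drivers (e.g. Browser("chrome")) need the matching WebDriver binary installed.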