Skip to content

Instantly share code, notes, and snippets.

#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Extract features from htmls
"""
import sys, os, random, datetime
from bs4 import BeautifulSoup
from preprocess import extract_features
import os, json
from bs4 import BeautifulSoup
import preprocess
# Script entry point: the statements below are meant to run only when this
# file is executed directly.
if __name__ == '__main__':
# Resolve the data directories relative to the location of this script.
current_dir = os.path.dirname(__file__)
form_dir = os.path.join(current_dir, 'forms')
input_dir = os.path.join(current_dir, 'corpus', 'all-input')
# NOTE(review): presumably the HTML <input> type values of interest; how the
# list is used is not visible in this excerpt -- confirm.
input_types = ['text', 'email', 'password']
beautifulsoup4
cycler
matplotlib
numpy
pyparsing
python-dateutil
pytz
requests
six
from bs4 import BeautifulSoup
html_doc = """
<html>
<head>
<title>我是網頁標題</title>
<style>
.large {
color:blue;
text-align: center;
import json
import math
from collections import Counter
from matplotlib import pyplot as plt
def mean(x):
    """Return the arithmetic mean of the values in *x*.

    Args:
        x: A sized collection of numbers; it must support both ``sum()``
            and ``len()`` (e.g. a list or tuple).

    Returns:
        ``sum(x) / len(x)``.

    Raises:
        ZeroDivisionError: If *x* is empty.
    """
    return sum(x) / len(x)
[
{
"href": "/bbs/Beauty/M.1482072854.A.DDC.html",
"num_image": 3,
"push_count": 18,
"title": "[神人] 長榮空姐"
},
{
"href": "/bbs/Beauty/M.1482075654.A.C1D.html",
"num_image": 7,
import requests
import time
from bs4 import BeautifulSoup
# Issue an HTTP GET request for `url`.
# NOTE(review): the rest of this function (response handling and return
# value) is not visible in this excerpt.
def get_web_page(url):
resp = requests.get(
url=url,
# Send an 'over18' cookie with the request -- presumably to get past an
# age-confirmation page; confirm against the target site.
cookies={'over18': '1'}
)
[
{
"_id": "58d5c70c27ea7d070060160e",
"categories": [
"55de81ac9d1fa51000f94770",
"55de81929d1fa51000f94769",
"55de81879d1fa51000f94766"
],
"coverImage": {
"_id": "58f318cc4909c907004ac575",
# Request course data from the Hahow courses API, starting with a first
# request for 30 published courses.
# NOTE(review): the body of the pagination loop is not visible in this excerpt.
def crawl():
# Initial API: https://api.hahow.in/api/courses?limit=12&status=PUBLISHED
# Follow-up API: https://api.hahow.in/api/courses?latestId=54d5a117065a7e0e00725ac0&latestValue=2015-03-27T15:38:27.187Z&limit=30&status=PUBLISHED
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
'AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/59.0.3071.115 Safari/537.36'}
url = 'https://api.hahow.in/api/courses'
courses = list()
resp_courses = requests.get(url + '?limit=30&status=PUBLISHED', headers=headers).json()
while resp_courses: # keep fetching the next batch as long as data is returned
# Load the course list from hahow_courses.json.
with open('hahow_courses.json', 'r', encoding='utf-8') as f:
courses = json.load(f)
# Extract the pre-order price / launch price / student count of the
# programming courses and display summary statistics.
pre_order_prices = list()
prices = list()
tickets = list()
lengths = list()
for c in courses:
# Only consider courses whose category list contains this id (the
# programming category, per the comment above).
if '55de81ac9d1fa51000f94770' in c['categories']: