View extract_features.py
#!/usr/bin/python | |
# -*- coding: utf-8 -*- | |
""" | |
Extract features from htmls | |
""" | |
import sys, os, random, datetime | |
from bs4 import BeautifulSoup | |
# preprocess.py 檔案在這裡 |
View getTopic.py
def getTopic(self, ai, imgtopic, Dict): | |
# JW: features 是準備要被預測 topic 的變數, 應該要挪到下方 prediction 開始前, 程式比較連貫 | |
#extract the features of the element | |
features = str(re.sub(' +', ' ', ' '.join(self.extract_features(ai, imgtopic, Dict, 1)))) | |
#print (features) | |
# JW: 從這裡開始可以獨立切一個 function, 只呼叫一次, 把 train 好的 model 存起來 | |
#open training data file | |
current_dir = os.path.dirname(_file_) | |
corpus_dir = os.path.join(current_dir, 'corpus', 'all-corpus') |
View cs295-first-look.ipynb

Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
View blog_hahow_crawler_3.py
with open('hahow_courses.json', 'r', encoding='utf-8') as f: | |
courses = json.load(f) | |
# 取出程式類課程的募資價/上線價/學生數,並顯示統計資料 | |
pre_order_prices = list() | |
prices = list() | |
tickets = list() | |
lengths = list() | |
for c in courses: | |
if '55de81ac9d1fa51000f94770' in c['categories']: |
View blog_hahow_crawler_2.json
{ | |
"_id": "58744feda8aae907000d06c0", | |
"categories": [ | |
"55de81ac9d1fa51000f94770", | |
"55de81929d1fa51000f94769" | |
], | |
"coverImage": { | |
"_id": "588421e46ecf3a0700b7a31d", | |
"url": "https://hahow.in/images/588421e46ecf3a0700b7a31d" | |
}, |
View blog_hahow_crawler_1.py
def crawl(): | |
# 初始 API: https://api.hahow.in/api/courses?limit=12&status=PUBLISHED | |
# 接續 API: https://api.hahow.in/api/courses?latestId=54d5a117065a7e0e00725ac0&latestValue=2015-03-27T15:38:27.187Z&limit=30&status=PUBLISHED | |
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ' | |
'AppleWebKit/537.36 (KHTML, like Gecko) ' | |
'Chrome/59.0.3071.115 Safari/537.36'} | |
url = 'https://api.hahow.in/api/courses' | |
courses = list() | |
resp_courses = requests.get(url + '?limit=30&status=PUBLISHED', headers=headers).json() | |
while resp_courses: # 有回傳資料則繼續下一輪擷取 |
View hahow_courses.json
[ | |
{ | |
"_id": "58d5c70c27ea7d070060160e", | |
"categories": [ | |
"55de81ac9d1fa51000f94770", | |
"55de81929d1fa51000f94769", | |
"55de81879d1fa51000f94766" | |
], | |
"coverImage": { | |
"_id": "58f318cc4909c907004ac575", |
View hahow_crawler.py
import requests | |
import json | |
import time | |
import numpy as np | |
import os | |
category = { | |
'55de818a9d1fa51000f94767': '生活', | |
'55de818d9d1fa51000f94768': '藝術', | |
'55de819a9d1fa51000f9476b': '運動', |
View example.json
[ | |
{ | |
"href": "/bbs/Beauty/M.1482072854.A.DDC.html", | |
"num_image": 3, | |
"push_count": 18, | |
"title": "[神人] 長榮空姐" | |
}, | |
{ | |
"href": "/bbs/Beauty/M.1482075654.A.C1D.html", | |
"num_image": 7, |
View tutorial5_demo.py
import json | |
import math | |
from collections import Counter | |
from matplotlib import pyplot as plt | |
def mean(x):
    """Return the arithmetic mean of the values in *x*.

    The result is ``sum(x) / len(x)``, so *x* must be a sized collection
    of numbers (it has to support both ``sum()`` and ``len()``).

    Raises:
        ZeroDivisionError: if *x* is empty, because its length is zero.
    """
    return sum(x) / len(x)
NewerOlder