Skip to content

Instantly share code, notes, and snippets.

Jun-Wei Lin jwlin

Block or report user

Report or block jwlin

Hide content and notifications from this user.

Learn more about blocking users

Contact Support about this user’s behavior.

Learn more about reporting abuse

Report abuse
View GitHub Profile
View extract_features.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Extract features from htmls
"""
import sys, os, random, datetime
from bs4 import BeautifulSoup
# preprocess.py 檔案在這裡
View getTopic.py
def getTopic(self, ai, imgtopic, Dict):
# JW: features 是準備要被預測 topic 的變數, 應該要挪到下方 prediction 開始前, 程式比較連貫
# extract the features of the element
features = str(re.sub(' +', ' ', ' '.join(self.extract_features(ai, imgtopic, Dict, 1))))
#print (features)
# JW: 從這裡開始可以獨立切一個 function, 只呼叫一次, 把 train 好的 model 存起來
#open training data file
current_dir = os.path.dirname(_file_)
corpus_dir = os.path.join(current_dir, 'corpus', 'all-corpus')
View cs295-first-look.ipynb
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
View blog_hahow_crawler_3.py
with open('hahow_courses.json', 'r', encoding='utf-8') as f:
courses = json.load(f)
# 取出程式類課程的募資價/上線價/學生數,並顯示統計資料
pre_order_prices = list()
prices = list()
tickets = list()
lengths = list()
for c in courses:
if '55de81ac9d1fa51000f94770' in c['categories']:
View blog_hahow_crawler_2.json
{
"_id": "58744feda8aae907000d06c0",
"categories": [
"55de81ac9d1fa51000f94770",
"55de81929d1fa51000f94769"
],
"coverImage": {
"_id": "588421e46ecf3a0700b7a31d",
"url": "https://hahow.in/images/588421e46ecf3a0700b7a31d"
},
View blog_hahow_crawler_1.py
def crawl():
# 初始 API: https://api.hahow.in/api/courses?limit=12&status=PUBLISHED
# 接續 API: https://api.hahow.in/api/courses?latestId=54d5a117065a7e0e00725ac0&latestValue=2015-03-27T15:38:27.187Z&limit=30&status=PUBLISHED
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
'AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/59.0.3071.115 Safari/537.36'}
url = 'https://api.hahow.in/api/courses'
courses = list()
resp_courses = requests.get(url + '?limit=30&status=PUBLISHED', headers=headers).json()
while resp_courses: # 有回傳資料則繼續下一輪擷取
View hahow_courses.json
[
{
"_id": "58d5c70c27ea7d070060160e",
"categories": [
"55de81ac9d1fa51000f94770",
"55de81929d1fa51000f94769",
"55de81879d1fa51000f94766"
],
"coverImage": {
"_id": "58f318cc4909c907004ac575",
View hahow_crawler.py
import requests
import json
import time
import numpy as np
import os
category = {
'55de818a9d1fa51000f94767': '生活',
'55de818d9d1fa51000f94768': '藝術',
'55de819a9d1fa51000f9476b': '運動',
View example.json
[
{
"href": "/bbs/Beauty/M.1482072854.A.DDC.html",
"num_image": 3,
"push_count": 18,
"title": "[神人] 長榮空姐"
},
{
"href": "/bbs/Beauty/M.1482075654.A.C1D.html",
"num_image": 7,
View tutorial5_demo.py
import json
import math
from collections import Counter
from matplotlib import pyplot as plt
def mean(x):
    """Return the arithmetic mean of the values in *x*.

    *x* is passed to both ``sum()`` and ``len()``, so it must be a sized
    collection of numbers rather than a one-shot iterator.

    Raises:
        ZeroDivisionError: if *x* is empty (the sum 0 is divided by the
            length 0).
    """
    return sum(x) / len(x)
You can’t perform that action at this time.