Skip to content

Instantly share code, notes, and snippets.

Jun-Wei Lin jwlin

Block or report user

Report or block jwlin

Hide content and notifications from this user.

Learn more about blocking users

Contact Support about this user’s behavior.

Learn more about reporting abuse

Report abuse
View GitHub Profile
View blog_hahow_crawler_2.json
"_id": "58744feda8aae907000d06c0",
"categories": [
"coverImage": {
"_id": "588421e46ecf3a0700b7a31d",
"url": ""
import requests
def get_web_page(url):
resp = requests.get(
cookies={'over18': '1'}
if resp.status_code != 200:
print('Invalid url:', resp.url)
View cs295-first-look.ipynb
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
import requests
import json
import time
import numpy as np
import os
category = {
'55de818a9d1fa51000f94767': '生活',
'55de818d9d1fa51000f94768': '藝術',
'55de819a9d1fa51000f9476b': '運動',
def getTopic(self, ai, imgtopic, Dict):
# JW: features 是準備要被預測 topic 的變數, 應該要挪到下方 prediction 開始前, 程式比較連貫
# extract the features of the element
features = str(re.sub(' +', ' ', ' '.join(self.extract_features(ai, imgtopic, Dict, 1))))
#print (features)
# JW: 從這裡開始可以獨立切一個 function, 只呼叫一次, 把 train 好的 model 存起來
#open training data file
current_dir = os.path.dirname(_file_)
corpus_dir = os.path.join(current_dir, 'corpus', 'all-corpus')
# -*- coding: utf-8 -*-
Extract features from htmls
import sys, os, random, datetime
from bs4 import BeautifulSoup
# 檔案在這裡
import requests
import time
from bs4 import BeautifulSoup
import os
import re
import urllib.request
import json
PTT_URL = ''
You can’t perform that action at this time.