Skip to content

Instantly share code, notes, and snippets.

Jun-Wei Lin jwlin

Block or report user

Report or block jwlin

Hide content and notifications from this user.

Learn more about blocking users

Contact Support about this user’s behavior.

Learn more about reporting abuse

Report abuse
View GitHub Profile
View feature_extraction.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Extract features from htmls
"""
import sys, os, random, datetime
from bs4 import BeautifulSoup
from preprocess import extract_features
View generate_input.py
import os, json
from bs4 import BeautifulSoup
import preprocess
if __name__ == '__main__':
current_dir = os.path.dirname(__file__)
form_dir = os.path.join(current_dir, 'forms')
input_dir = os.path.join(current_dir, 'corpus', 'all-input')
input_types = ['text', 'email', 'password']
View requests_demo.py
import requests
def get_web_page(url):
resp = requests.get(
url=url,
cookies={'over18': '1'}
)
if resp.status_code != 200:
print('Invalid url:', resp.url)
View requirement.txt
beautifulsoup4
cycler
matplotlib
numpy
pyparsing
python-dateutil
pytz
requests
six
View beautifulsoup_demo.py
from bs4 import BeautifulSoup
html_doc = """
<html>
<head>
<title>我是網頁標題</title>
<style>
.large {
color:blue;
text-align: center;
View tutorial3_demo.py
import requests
import time
from bs4 import BeautifulSoup
def get_web_page(url):
resp = requests.get(
url=url,
cookies={'over18': '1'}
)
View tutorial4_demo.py
import requests
import time
from bs4 import BeautifulSoup
import os
import re
import urllib.request
import json
PTT_URL = 'https://www.ptt.cc'
View tutorial5_demo.py
import json
import math
from collections import Counter
from matplotlib import pyplot as plt
def mean(x):
    """Return the arithmetic mean of the numbers in *x*.

    *x* must support both iteration and ``len()`` (e.g. a list or tuple);
    a one-shot iterator will not work because its length is unknown.

    Raises:
        ZeroDivisionError: if *x* is empty, since the sum is divided by a
            length of zero.
    """
    return sum(x) / len(x)
View example.json
[
{
"href": "/bbs/Beauty/M.1482072854.A.DDC.html",
"num_image": 3,
"push_count": 18,
"title": "[神人] 長榮空姐"
},
{
"href": "/bbs/Beauty/M.1482075654.A.C1D.html",
"num_image": 7,
View hahow_crawler.py
import requests
import json
import time
import numpy as np
import os
category = {
'55de818a9d1fa51000f94767': '生活',
'55de818d9d1fa51000f94768': '藝術',
'55de819a9d1fa51000f9476b': '運動',
You can’t perform that action at this time.