Skip to content

Instantly share code, notes, and snippets.

import json
import math
from collections import Counter
from matplotlib import pyplot as plt
def mean(x):
    """Return the arithmetic mean of the values in *x*.

    Args:
        x: A sized collection of numbers; it is passed to both ``sum`` and
            ``len``, so a one-shot iterator will not work.

    Returns:
        ``sum(x) / len(x)`` (true division, so ints yield a float).

    Raises:
        ZeroDivisionError: If *x* is empty.
    """
    return sum(x) / len(x)
import requests
import time
from bs4 import BeautifulSoup
import os
import re
import urllib.request
import json
PTT_URL = 'https://www.ptt.cc'
import requests
import time
from bs4 import BeautifulSoup
def get_web_page(url):
resp = requests.get(
url=url,
cookies={'over18': '1'}
)
from bs4 import BeautifulSoup
html_doc = """
<html>
<head>
<title>我是網頁標題</title>
<style>
.large {
color:blue;
text-align: center;
beautifulsoup4
cycler
matplotlib
numpy
pyparsing
python-dateutil
pytz
requests
six
import requests
def get_web_page(url):
resp = requests.get(
url=url,
cookies={'over18': '1'}
)
if resp.status_code != 200:
print('Invalid url:', resp.url)
import os, json
from bs4 import BeautifulSoup
import preprocess
if __name__ == '__main__':
current_dir = os.path.dirname(__file__)
form_dir = os.path.join(current_dir, 'forms')
input_dir = os.path.join(current_dir, 'corpus', 'all-input')
input_types = ['text', 'email', 'password']
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Extract features from HTML documents
"""
import sys, os, random, datetime
from bs4 import BeautifulSoup
from preprocess import extract_features