#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Extract features from htmls
"""
import sys, os, random, datetime
from bs4 import BeautifulSoup
# preprocess.py lives here:
# https://github.com/jwlin/icst2017/blob/master/preprocess.py
from preprocess import extract_features
# Randomly order the html ids under parent_dir (optionally keep only n of them).
def random_pick(parent_dir, n=None):
    """Return a shuffled list of html ids found in parent_dir.

    Each filename is expected to look like '<id>-rest'; the id is the part
    before the first '-'.  With n=None (the default) every id is returned,
    matching the original behavior; pass n (e.g. 20) to keep only the first
    n shuffled ids as the training sample.
    """
    ids = [fname.split('-')[0] for fname in os.listdir(parent_dir)]
    random.shuffle(ids)
    # Slice only when a sample size was explicitly requested.
    return ids if n is None else ids[:n]
if __name__ == '__main__':
    current_dir = os.path.dirname(__file__)
    form_dir = 'forms'
    # One corpus directory per run, timestamped to avoid collisions.
    train_dir = os.path.join(current_dir, 'corpus', 'trial-' + datetime.datetime.now().strftime('%Y%m%d-%H%M%S'))
    os.makedirs(train_dir)
    input_types = ['text', 'email', 'password']
    training_ids = random_pick(os.path.join(current_dir, form_dir))
    # Parenthesized %-format prints identically under Python 2 and 3
    # (the original `print x,` statement form was Python-2-only).
    print('training_ids: %s' % training_ids)
    # Set for O(1) membership tests in the loop below.
    training_id_set = set(training_ids)
    for fname in os.listdir(os.path.join(current_dir, form_dir)):
        if fname.split('-')[0] not in training_id_set:
            continue
        with open(os.path.join(current_dir, form_dir, fname), 'r') as f:
            # Lowercase the whole dom so type='TEXT' etc. still match.
            dom = f.read().lower()
        soup = BeautifulSoup(dom, 'html5lib')
        file_name, extension = os.path.splitext(fname)
        # BUG FIX: train_dir already starts with current_dir; joining
        # current_dir again duplicated the prefix (e.g. 'proj/proj/corpus/...')
        # whenever the script was launched via a relative path.
        c_path = os.path.join(train_dir, file_name + '.corpus')
        # Collect features first, then write once — the original reopened the
        # corpus file in append mode for every single input tag.
        features = []
        for input_type in input_types:
            for input_tag in soup.find_all('input', attrs={'type': input_type}):
                features.append(extract_features(input_tag) + '\n')
        if features:  # original only created the file when a tag matched
            with open(c_path, 'a') as cf:
                cf.writelines(features)