Skip to content

Instantly share code, notes, and snippets.

@jwlin
Created November 2, 2016 05:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jwlin/6b01ecb2c58df6e7c7ee3d141ca37600 to your computer and use it in GitHub Desktop.
Save jwlin/6b01ecb2c58df6e7c7ee3d141ca37600 to your computer and use it in GitHub Desktop.
import os, json
from bs4 import BeautifulSoup
import preprocess
def build_answer_map(labelled):
    """Collapse the labelled corpus into a ``feature -> type`` lookup.

    Args:
        labelled: Mapping whose values are dicts carrying at least the
            ``'feature'`` and ``'type'`` keys (the parsed content of
            ``label-all-corpus.json``). The mapping's own keys are ignored.

    Returns:
        A dict mapping every distinct feature string to its labelled type.

    Raises:
        ValueError: If the same feature appears with two different types.
            This is raised explicitly rather than checked with ``assert``
            so the validation still runs under ``python -O``.
    """
    answers = {}
    for entry in labelled.values():
        feature = entry['feature']
        label = entry['type']
        if feature in answers:
            if answers[feature] != label:
                raise ValueError(
                    'conflicting labels for feature {!r}: {!r} vs {!r}'.format(
                        feature, answers[feature], label))
        else:
            answers[feature] = label
    return answers


def extract_form_inputs(dom, answers, input_types):
    """Build one labelled record per matching ``<input>`` tag in *dom*.

    Args:
        dom: HTML source of a form page.
        answers: ``feature -> type`` lookup from :func:`build_answer_map`.
        input_types: Values of the ``type`` attribute to collect, in the
            order the resulting records should be grouped.

    Returns:
        A list of dicts with the keys ``'type'``, ``'feature'``, ``'dom'``
        and ``'rule'`` (the latter holding empty ``type``/``id``/``name``
        lists).

    Raises:
        KeyError: If an extracted feature has no entry in *answers*.
    """
    soup = BeautifulSoup(dom, 'html5lib')
    records = []
    for input_type in input_types:
        for input_tag in soup.find_all('input', attrs={'type': input_type}):
            # NOTE(review): preprocess.extract_features is assumed to return
            # a string; its whitespace runs are collapsed to single spaces so
            # the result can be looked up in the label map.
            feature = preprocess.extract_features(input_tag)
            feature = ' '.join(feature.split())
            records.append({
                'type': answers[feature],
                'feature': feature,
                'dom': str(input_tag),
                'rule': {
                    'type': [],
                    'id': [],
                    'name': [],
                },
            })
    return records


def main():
    """Write a labelled ``.input`` JSON file for every form under ``forms/``.

    Reads ``corpus/label-all-corpus.json`` next to this script, then for each
    file in ``forms/`` extracts its text/email/password inputs and dumps the
    labelled records to ``corpus/all-input/<form name>.input``.
    """
    current_dir = os.path.dirname(__file__)
    form_dir = os.path.join(current_dir, 'forms')
    input_dir = os.path.join(current_dir, 'corpus', 'all-input')
    input_types = ['text', 'email', 'password']

    label_path = os.path.join(current_dir, 'corpus', 'label-all-corpus.json')
    with open(label_path, 'r', encoding='utf-8') as f:
        answers = build_answer_map(json.load(f))

    for fname in os.listdir(form_dir):
        # form_dir already starts with current_dir; joining current_dir again
        # would duplicate the prefix whenever the script path is relative.
        with open(os.path.join(form_dir, fname), 'r', encoding='utf-8') as f:
            dom = f.read().lower()

        records = extract_form_inputs(dom, answers, input_types)

        file_name, _ = os.path.splitext(fname)
        c_path = os.path.join(input_dir, file_name + '.input')
        # Overwrite instead of append: appending a second JSON array on a
        # re-run would leave the file as invalid JSON.
        with open(c_path, 'w', encoding='utf-8') as cf:
            json.dump(records, cf, indent=2, sort_keys=True, ensure_ascii=False)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment