Skip to content

Instantly share code, notes, and snippets.

@nkt1546789
Last active August 29, 2015 14:25
Show Gist options
  • Save nkt1546789/51eb6b83694ca09d47fc to your computer and use it in GitHub Desktop.
Save nkt1546789/51eb6b83694ca09d47fc to your computer and use it in GitHub Desktop.
A wrapper around dragnet's content extractor that lets you apply your own probability threshold when selecting content blocks.
from dragnet import content_extractor
# Class labels known to dragnet's underlying block model, as a plain list so
# that .index() can be used on them.
classes=list(content_extractor._block_model.classes_)
# Position of label 1 within those class labels; used below to pick the
# matching column out of predict_proba's output.
# NOTE(review): label 1 is presumably the "content" class -- confirm against dragnet.
positive_idx=classes.index(1)
def extract(html, block=False, threshold=0.2):
    """Extract the blocks of ``html`` whose model score reaches ``threshold``.

    The document is split into blocks by ``content_extractor.make_features``;
    each block is scored with the ``positive_idx`` column of the block
    model's ``predict_proba`` output, and blocks scoring at least
    ``threshold`` are kept.

    Args:
        html: The HTML document handed to ``content_extractor.make_features``.
        block: If truthy, return the kept block objects themselves instead of
            their joined text.
        threshold: Minimum score (inclusive) a block needs in order to be kept.

    Returns:
        A list of the kept block objects when ``block`` is truthy; otherwise a
        single string made of the kept blocks' ``text`` joined by one space.
    """
    features, blocks = content_extractor.make_features(html)
    probabilities = content_extractor._block_model.predict_proba(features)
    scores = probabilities[:, positive_idx]

    # Filter once; both return shapes are built from the same selection.
    kept = [
        candidate
        for position, candidate in enumerate(blocks)
        if scores[position] >= threshold
    ]

    if block:
        return kept
    return " ".join([candidate.text for candidate in kept])
# demo
import requests

if __name__ == "__main__":
    # Run the demo only when this file is executed as a script, so that
    # importing the module does not fire a network request.
    url = "your url"  # placeholder: replace with the page to extract
    html = requests.get(url).content
    # The function-call form of print works on both Python 2 and Python 3
    # for a single argument; the old print statement fails on Python 3.
    print(extract(html))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment