Last active
July 6, 2016 11:05
-
-
Save ashim888/797cae73c921e282a56db0dc477455e1 to your computer and use it in GitHub Desktop.
Child Safety Detection
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# -*- coding: utf-8 -*- | |
from pymongo import MongoClient | |
import pprint | |
from nltk.corpus import stopwords | |
from nltk.tokenize import RegexpTokenizer | |
from microsofttranslator import Translator | |
import langid | |
# Keywords whose presence in a page's tokenized metadata marks it as unsafe.
# NOTE(review): safety_check matches single lowercased word tokens against this
# list, so the multi-word and capitalized entries below can never match as
# written; they are kept so the list contents stay backward-compatible.
porn_list = [
    "anal", "hentai", "anus", "arse", "butt", "arsehole", "ass", "fcuk",
    "fuck", "naked", "xvideos", "porn", "sex", "porno", "free porn",
    "porn tube", "porn videos", "streaming porn", "Free porn", "sex videos",
    "pussy", "Porn hub",
    # A missing comma used to fuse this entry with the next string literal
    # into "xxxporn", so "xxx" itself was never flagged.
    "xxx",
]
def _field_text(value):
    """Return one database field as text; a missing value becomes empty text.

    List/tuple values are joined item by item so each entry is tokenized on
    its own rather than through the container's repr.
    NOTE(review): the stored type of each field is not visible here - confirm.
    """
    if value is None:
        return u""
    if isinstance(value, (list, tuple)):
        return u" ".join(u"%s" % item for item in value)
    # u"%s" keeps non-ASCII text intact; str() on Python 2 raises
    # UnicodeEncodeError for exactly the non-English pages that need
    # translating, which aborted the whole check.
    return u"%s" % value


def safety_check(domain):
    """Print whether the stored metadata of *domain* contains flagged keywords.

    Looks up the ``WebInfo`` document for *domain*, combines its ``title``,
    ``metaDescription`` and ``metaTags`` fields, translates the text to
    English when langid classifies it as another language, and intersects
    the lowercased word tokens (minus English stop words) with ``porn_list``.

    Args:
        domain: Domain name used as the ``domain`` key of the lookup.

    Returns:
        None. The verdict ("SAFETY CHECK FAIL" / "SAFETY CHECK PASS" /
        "Not Found In Database") is printed. Errors raised during the check
        are printed rather than propagated (best-effort behaviour).
    """
    tokenizer = RegexpTokenizer(r'\w+')
    client = MongoClient('mongodb://192.168.1.10:27017/', 27017)
    try:
        record = client.cutestat_v3.WebInfo.find_one({"domain": domain})
        if record is None:
            print(domain + ' Not Found In Database' + '\n')
            return

        # Join with spaces: plain concatenation fused the last word of one
        # field with the first word of the next, hiding both from matching.
        overall_text = u" ".join(
            _field_text(record.get(field))
            for field in ('title', 'metaDescription', 'metaTags')
        )

        # Language detect; classify() yields a (language, score) pair.
        prediction = langid.classify(overall_text)
        language = prediction[0] if prediction else None

        # Translate only when needed, so no translator is built for English.
        if language is not None and language != 'en':
            print('Another language Found: ' + language)
            translator = Translator('<Your Client ID>', '<Your Client Secret>')
            overall_text = translator.translate(overall_text, "en")

        # Lowercase before removing stop words so capitalized stop words are
        # dropped too; a set makes each membership test O(1).
        stop_words = set(stopwords.words('english'))
        tokens = set(word.lower() for word in tokenizer.tokenize(overall_text))
        flagged = (tokens - stop_words) & set(porn_list)

        if flagged:
            print(domain + " SAFETY CHECK FAIL")
            print("Total Abusive Keywords Found: %d" % len(flagged))
            print('\n')
        else:
            print(domain + " SAFETY CHECK PASS")
            print('\n')
    except TypeError:
        print(domain + " Not found")
    except Exception as exc:
        # Best-effort boundary: report the problem and let the caller move
        # on to the next domain.
        print(exc)
    finally:
        # Release the connection opened for this check.
        client.close()
# Sample run: check a fixed set of domains, one after another, in this order.
for _sample_domain in (
    "www.befuck.com",
    "baidu.com",
    "gioia.it",
    "partyporn.co.il",
    "x-nxx.co.il",
    "jw.org",
    "xhamster.com",
    "www.xnxx.com",
    "ratopati.com",
):
    safety_check(_sample_domain)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment