Created
January 27, 2016 21:44
-
-
Save anonymous/f49fd41c16f9a2f0bcfa to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'json'

require 'decisiontree'
require 'fuzzystringmatch'
#what are you returning?? | |
# process inputs into some form and feed them into classifier | |
$training = Array.new | |
def download_extractor(input) | |
#get zips, rars, exes, | |
puts "Entered download_extractor" | |
begin | |
input.each do |p| | |
out = p["message"].scan('\b(.*\.(?:zip|rar|exe|html))\b').size | |
puts out if out!=0 | |
end | |
rescue => e | |
end | |
end | |
def unicode_checker(input) | |
avg = 0 | |
input.each do |lol| | |
begin | |
succ = lol["message"].scan(/[^\P{L}a-zA-Z0-9]/).size | |
puts lol["message"] if succ!=0 | |
#puts "Done" | |
avg += succ/(lol["message"].len) | |
rescue => e | |
next | |
end | |
end | |
p avg | |
return (avg/input.length) | |
end | |
def smiley_counter(input) | |
max = avg = 0 | |
input.each do |p| | |
count = 0 | |
begin | |
p | |
#puts p["message"] | |
#puts "entered smiley" | |
succ=p["message"].scan(/(?:[3O>]?)\:(?:(?:[\)PDO*\\\|\3])|(?:'\())|(<3)|(o.O)|(^.^)|(;\))|(8(?:-\)|\|)){1,}/).size | |
max = succ if succ>max | |
avg += succ/(p["message"].len) | |
rescue => e | |
next | |
end | |
end | |
avg = avg/(input.length) | |
p avg | |
return avg | |
end | |
def ascii_checker(input) | |
avg = 0 | |
input.each do |p| | |
count = 0 | |
begin | |
p["message"].chars do |m| | |
unless m.ascii_only? | |
count+=1 | |
end | |
end | |
avg += count/(p["message"].len) | |
rescue => e | |
next | |
end | |
#puts "#{count} #{p["message"].length}" | |
end | |
p avg | |
return avg/input.length | |
end | |
def calc_similarity(input) | |
jarow = FuzzyStringMatch::JaroWinkler.create( :pure ) | |
max = 0 | |
min = 1 | |
avg = 0 | |
input.each_with_index do |c, ind1| | |
input.each_with_index do |d, ind2| | |
begin | |
if ind1 == ind2 | |
#puts "#{ind1} #{ind2}" | |
next | |
else | |
now = jarow.getDistance(c["message"],d["message"]) | |
max = now if now > max | |
min = now if now < min | |
avg += now/([c["message"].len, d["message"].len].max) | |
if now == 1 | |
puts c["message"]. d["message"] | |
end | |
end | |
rescue => e | |
next | |
end | |
end | |
end | |
avg = avg / input.length | |
puts max, min | |
return max, avg | |
end | |
def story_examiner(input) | |
avg = 0 | |
input.each do |l| | |
begin | |
avg+=l["story"].length | |
rescue => e | |
next | |
end | |
end | |
avg/=input.length | |
return avg | |
end | |
def post_processor(dataHash, var = 0) | |
outHash = Hash.new | |
my_arr = Array.new | |
begin | |
dataHash.keys.each do |k| | |
if k == "posts" | |
#p "SUCCESS" | |
#puts dataHash[k] | |
avg_ascii = ascii_checker(dataHash[k]) | |
"ascii worked" | |
max_sim, avg_sim = calc_similarity(dataHash[k]) | |
"good start" | |
avg_smileys = smiley_counter(dataHash[k]) | |
#download_extractor(dataHash[k]) | |
p "getting close" | |
avg_unicode = unicode_checker(dataHash[k]) | |
p "Almost there" | |
avg_story = story_examiner(dataHash[k]) | |
p "worked" | |
puts avg_unicode, avg_ascii, avg_smileys, max_sim, avg_sim | |
puts "Done" | |
my_arr<<avg_ascii<<max_sim<<avg_sim<<avg_smileys<<avg_unicode<<var | |
puts my_arr | |
end | |
end | |
# my_arr<<avg_ascii<<max_sim<<avg_sim<<avg_smileys<<avg_unicode | |
rescue => e | |
puts e | |
end | |
puts "MY ARR = #{my_arr}" | |
puts "We done here" | |
$training<<my_arr | |
puts $training | |
end | |
def train() | |
#These two lines are only for getting malicious files out and processing only them | |
File.open('mal_pages_as_ids.txt') do |alt| | |
while f = alt.gets | |
#puts f | |
f = f.chomp! | |
#Dir.entries(Dir.pwd).select {|f| f.include? ".json"}.each do |f| | |
begin | |
me = File.read("#{f}.json") | |
dataHash = Hash.new | |
dataHash = JSON.parse(me) | |
post_processor(dataHash,1) | |
rescue => e | |
next | |
p "didn't work for #{f}" | |
end | |
end | |
end | |
end | |
def main() | |
train() | |
puts "Caught rat" | |
puts $training | |
puts "BA DUM TSS" | |
end | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment