# Source: GitHub gist anonymous/f49fd41c16f9a2f0bcfa (created January 27, 2016 21:44).
# The surrounding GitHub page chrome (star/fork/save prompts) is not part of the script.
require 'json'
require 'fuzzystringmatch'
require 'decisiontree'
# TODO: decide what this script should return or emit.
# The inputs are processed into numeric features that are meant to be fed
# into a classifier; the feature rows gathered so far are kept in this global.
$training = []
# Reports how many downloadable-file references each post's message contains.
#
# For every post whose "message" mentions at least one .zip, .rar, .exe or
# .html name, the number of matches is printed on its own line.
#
# @param input [Array<Hash>] posts; presumably parsed JSON objects with an
#   optional "message" string -- confirm against the data files
# @return [Array<Hash>, nil] +input+ itself, or nil when it cannot be iterated
def download_extractor(input)
  puts "Entered download_extractor"
  return nil unless input.respond_to?(:each)

  # This has to be a Regexp: String#scan treats a String argument as literal
  # text, so the previously quoted pattern could never match anything.
  # \S+ (rather than a greedy .*) lets several links in one message count separately.
  download_link = /\b\S+\.(?:zip|rar|exe|html)\b/i

  input.each do |post|
    message = post.is_a?(Hash) ? post["message"] : nil
    # Skip posts without text instead of aborting the whole scan.
    next unless message.is_a?(String)

    hits = message.scan(download_link).size
    puts hits unless hits.zero?
  end
end
# Averages, over all posts, the share of each message that consists of
# non-ASCII letters.
#
# Messages containing at least one such letter are echoed to stdout, and the
# accumulated (not yet averaged) total is printed before returning.
#
# @param input [Array<Hash>] posts; presumably parsed JSON objects with an
#   optional "message" string -- confirm against the data files
# @return [Float] sum of the per-message ratios divided by the number of
#   posts (posts without a usable message contribute 0); 0.0 for empty input
def unicode_checker(input)
  return 0.0 if input.nil? || input.empty?

  total = 0.0
  input.each do |post|
    message = post.is_a?(Hash) ? post["message"] : nil
    next unless message.is_a?(String) && !message.empty?

    # Matches letters (\p{L}) that are not ASCII alphanumerics.
    foreign_letters = message.scan(/[^\P{L}a-zA-Z0-9]/).size
    puts message unless foreign_letters.zero?
    # String has no #len, so the old call raised for every post and the total
    # never moved; float division also keeps the ratio from truncating to 0.
    total += foreign_letters.to_f / message.length
  end
  p total
  total / input.length
end
# Averages, over all posts, the number of emoticons per character of message.
#
# Recognised emoticons: ":" followed by one of ) P D O * \ | 3 or by '(
# (optionally preceded by 3, O or >), plus <3, o.O, ^.^, ;) and runs of
# 8-) or 8|. The resulting average is printed before it is returned.
#
# @param input [Array<Hash>] posts; presumably parsed JSON objects with an
#   optional "message" string -- confirm against the data files
# @return [Float] sum of the per-message ratios divided by the number of
#   posts (posts without a usable message contribute 0); 0.0 for empty input
def smiley_counter(input)
  return 0.0 if input.nil? || input.empty?

  # Fixes over the previous pattern: "^" and "." are escaped so ^.^ and o.O
  # match literally (unescaped, ^.^ could never match and o.O matched any
  # middle character), and the ":3" face uses a literal 3 instead of the
  # octal escape \3.
  smiley = /[3O>]?:(?:[\)PDO*\\|3]|'\()|<3|o\.O|\^\.\^|;\)|(?:8(?:-\)|\|))+/

  total = 0.0
  input.each do |post|
    message = post.is_a?(Hash) ? post["message"] : nil
    next unless message.is_a?(String) && !message.empty?

    # String has no #len (the old call always raised); divide as floats so
    # the ratio is not truncated to zero.
    total += message.scan(smiley).size.to_f / message.length
  end

  average = total / input.length
  p average
  average
end
# Averages, over all posts, the share of non-ASCII characters in each message.
#
# The accumulated (not yet averaged) total is printed before returning.
#
# @param input [Array<Hash>] posts; presumably parsed JSON objects with an
#   optional "message" string -- confirm against the data files
# @return [Float] sum of the per-message ratios divided by the number of
#   posts (posts without a usable message contribute 0); 0.0 for empty input
def ascii_checker(input)
  return 0.0 if input.nil? || input.empty?

  total = 0.0
  input.each do |post|
    message = post.is_a?(Hash) ? post["message"] : nil
    next unless message.is_a?(String) && !message.empty?

    non_ascii = message.each_char.count { |char| !char.ascii_only? }
    # String has no #len (the old call always raised); divide as floats so
    # the ratio is not truncated to zero.
    total += non_ascii.to_f / message.length
  end
  p total
  total / input.length
end
# Compares every post's message with every other post's message using
# Jaro-Winkler similarity (fuzzy-string-match, pure-Ruby implementation).
#
# Each ordered pair (a, b) with a != b is scored. Pairs scoring exactly 1 are
# printed, and the highest and lowest scores are printed before returning.
#
# NOTE(review): as in the original formula, each score is divided by the
# longer message's length and the sum is divided by the number of posts
# (not the number of pairs) -- confirm this is the intended feature.
#
# @param input [Array<Hash>] posts; presumably parsed JSON objects with an
#   optional "message" string -- confirm against the data files
# @return [Array(Numeric, Float)] the highest pair score and the
#   length-normalised average; [0, 0.0] for empty input
def calc_similarity(input)
  return [0, 0.0] if input.nil? || input.empty?

  jarow = FuzzyStringMatch::JaroWinkler.create(:pure)
  messages = input.map { |post| post.is_a?(Hash) ? post["message"] : nil }

  max = 0
  min = 1
  total = 0.0
  messages.each_with_index do |left, i|
    # Posts without text cannot be compared; skip them explicitly.
    next unless left.is_a?(String) && !left.empty?

    messages.each_with_index do |right, j|
      next if i == j
      next unless right.is_a?(String) && !right.empty?

      score = jarow.getDistance(left, right)
      max = score if score > max
      min = score if score < min
      # String has no #len; the old call raised here, so the average stayed 0.
      total += score.to_f / [left.length, right.length].max
      # Two arguments: the old "a. b" form was a method call on a String.
      puts left, right if score == 1
    end
  end

  puts max, min
  [max, total / input.length]
end
# Computes the average length of the "story" field across all posts.
#
# Posts without a "story" (or whose story has no length) add nothing to the
# sum but still count towards the divisor.
#
# @param input [Array<Hash>] posts; presumably parsed JSON objects with an
#   optional "story" string -- confirm against the data files
# @return [Integer] total story length divided by the number of posts, using
#   integer division; 0 for empty input (previously a ZeroDivisionError)
def story_examiner(input)
  return 0 if input.nil? || input.empty?

  total = input.inject(0) do |sum, post|
    story = post.is_a?(Hash) ? post["story"] : nil
    story.respond_to?(:length) ? sum + story.length : sum
  end
  total / input.length
end
# Builds one feature row from a page's "posts" and appends it to $training.
#
# The row is [avg_ascii, max_sim, avg_sim, avg_smileys, avg_unicode, var].
# Progress messages and the current contents of $training are printed along
# the way. Any error raised while extracting features is printed and the page
# contributes no row.
#
# @param dataHash [Hash] parsed page data; only the "posts" key is read
# @param var [Object] value appended as the last element of the row
#   (train passes 1; presumably the class label -- confirm)
# @return [nil]
def post_processor(dataHash, var = 0)
  my_arr = []
  begin
    if dataHash.key?("posts")
      posts = dataHash["posts"]
      avg_ascii = ascii_checker(posts)
      max_sim, avg_sim = calc_similarity(posts)
      avg_smileys = smiley_counter(posts)
      #download_extractor(posts)
      p "getting close"
      avg_unicode = unicode_checker(posts)
      p "Almost there"
      # TODO: story_examiner's average was computed here but never added to
      # the row; wire it in once the row layout is allowed to change.
      p "worked"
      puts avg_unicode, avg_ascii, avg_smileys, max_sim, avg_sim
      puts "Done"
      my_arr << avg_ascii << max_sim << avg_sim << avg_smileys << avg_unicode << var
      puts my_arr
    end
  rescue StandardError => e
    puts e
  end
  puts "MY ARR = #{my_arr}"
  puts "We done here"
  # Pages without posts (or whose extraction failed) used to add an empty row,
  # which is not a usable training example.
  $training << my_arr unless my_arr.empty?
  puts $training
end
# Feeds every page listed in mal_pages_as_ids.txt through post_processor.
#
# Each non-blank line of the list is treated as a page id; "<id>.json" is read
# from the current directory, parsed as JSON and passed to post_processor with
# 1 as the second argument. Pages that cannot be read or parsed are reported
# and skipped.
#
# @return [nil]
# @raise [Errno::ENOENT] when mal_pages_as_ids.txt itself is missing
def train
  File.foreach('mal_pages_as_ids.txt') do |line|
    # chomp, not chomp!: the bang form returns nil when there is nothing to
    # strip, which silently dropped a final line lacking a trailing newline.
    page_id = line.chomp
    next if page_id.empty?

    begin
      post_processor(JSON.parse(File.read("#{page_id}.json")), 1)
    rescue StandardError => e
      # Best effort: report the failing page (this message used to be
      # unreachable behind a `next`) and carry on with the rest.
      puts "didn't work for #{page_id}: #{e.message}"
    end
  end
end
# Script entry point: runs the training pass over the listed pages, then
# prints the collected $training rows between two marker lines.
def main
  train
  $stdout.puts "Caught rat"
  $stdout.puts $training
  $stdout.puts "BA DUM TSS"
end
# Kick off the whole pipeline as soon as the file is loaded.
main
# (GitHub page footer omitted: sign-up / sign-in prompt for commenting on the gist.)