@ibanez270dx
Created May 19, 2015 01:15
SKHIP work in progress
require 'io/console'
require 'optparse'
require 'ostruct'
require 'fileutils'
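# A quick usage sketch (file names below are only illustrative; HistoryIndex.sk
# normally lives under ~/Library/Safari/):
#
#   ruby skhip.rb --input ~/Library/Safari/HistoryIndex.sk --output history.txt
#   ruby skhip.rb --use-dump HistoryIndex-20150518181500.dump   # reuse an earlier dump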
NAME = "Safari Keyword History Index Parser"
VERSION = "v0.0.1"
TIME = Time.now
def box_me_up(str)
width = (STDOUT.winsize[-1].to_i-2).times.collect{'═'}.join
# width = (str.length+2).times.collect{'═'}.join
str = str.center(STDOUT.winsize[-1]-4,' ')
boxed = "╔#{width}╗\n║ #{str} ║\n╚#{width}╝\n"
end
def show_error_and_exit
puts @option_parser.banner
puts " #{$!}\n use --help for more information\n\n"
exit 1
end
################################################################################
# Command Line Options
################################################################################
# We set default values here.
options = OpenStruct.new
options.name = "HistoryIndex"
options.path = "/Users/#{ENV['USER']}/Library/Safari/"
options.time = "#{TIME.strftime("%Y%m%d%H%M%S")}"
options.line = STDOUT.winsize[-1].times.collect{'─'}.join
options.stdo = true
options.input = "#{options.name}.sk"
options.output = "#{options.name}-#{options.time}.txt"
options.backup = "#{options.name}-#{options.time}.backup"
options.dump = "#{options.name}-#{options.time}.dump"
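# With the defaults above, everything is read from and written to the current
# working directory, e.g. ./HistoryIndex.sk in and HistoryIndex-<timestamp>.txt,
# .backup and .dump out; options.path is defined but not referenced anywhere yet.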
# Start parsing those options
@option_parser = OptionParser.new do |opts|
opts.program_name = NAME
opts.version = VERSION
opts.banner = "#{box_me_up("#{NAME} #{VERSION}")}
Usage: ruby skhip.rb [options]\n\n"
opts.separator " Specific options:"
opts.on "-i", "--input FILE",
"Path to HistoryIndex.sk" \
do |input|
options.input = input
options.name = input[/(?<=\/)[\w]+(?=\.)/]
options.output = "#{options.name}-#{options.time}.txt"
File.open(input)
end
opts.separator ""
opts.on "-o", "--output FILE",
"Relative output location" \
do |output|
options.output = output
end
opts.separator ""
opts.on "-d", "--use-dump [FILE]",
"Skip dumping process by specifying an existing dump file.",
"Leave blank to use default path." \
do |dump|
options.dump = dump if dump # keep the default dump path when FILE is omitted
File.open(dump) if dump
end
opts.separator ""
opts.on_tail("-h", "--help", "What you're looking at :P") do
puts opts
exit
end
opts.on_tail("--version", "Show version") do
puts opts.program_name
puts opts.version
exit
end
end
begin
@option_parser.parse!
raise OptionParser::ParseError.new("arguments provided without switches!") unless ARGV.empty?
rescue OptionParser::ParseError
show_error_and_exit
end
################################################################################
# Setup
################################################################################
# Make a copy of the HistoryIndex.sk file
FileUtils.cp options.input, "#{options.backup}" \
rescue show_error_and_exit
# Dump the backup to text with xxd
`xxd -b -c 10 #{options.backup} >> #{options.dump}` \
rescue show_error_and_exit
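# Each line of the dump looks roughly like
#   <offset>: <ten 8-bit groups>  <ASCII rendering of those ten bytes>
# so split(' ').last below keeps just the trailing ASCII column, where xxd
# renders unprintable bytes as '.', which is where all the dots the parser
# keys off later come from.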
# Read in all the fragments
IO.foreach(options.dump) do |input|
(@acc ||= []) << input.split(' ').last \
rescue show_error_and_exit
end
# Make it one big ass string
dumped = @acc.join
################################################################################
# Parsing
################################################################################
parser = [] # collect regexps and their corresponding replacements
# Tag long runs of dots (256+) as segment boundaries so they can become separator lines later
parser << { regexp: /(?<=[^\.])\.{256,}(?=[^\.|IA])/, replacement: '...[SKHIP-PARSER-SEGMENT]...' }
# Looks like there's some code in there; it's the only place other than the
# URLs that has single dots. Collapse it so it doesn't pollute the URL parsing below.
parser << { regexp: /~(.)+big/, replacement: '*' }
# remove single dots between characters using regex lookarounds
parser << { regexp: /(?<=[^\.])\.{1}(?=[^\.])/, replacement: '' }
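# Illustrative effect of the rules above on a made-up fragment:
#   "h.t.t.p...w.w.w...e.x.a.m.p.l.e...c.o.m..." => "http...www...example...com..."
# Single dots between characters disappear; only the multi-dot runs survive as
# separators for the tokenizer below.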
# Execute!
parser.each do |r|
# puts r.inspect
dumped.gsub!(r[:regexp], r[:replacement])
end
################################################################################
# Tokenize that shit
################################################################################
current_index = 0
last_char = ""
@token = ""
@tokens = []
@rejected = []
dumped.split('').each do |char|
char.strip!
if (char == "." && last_char != ".")
# end of a word, add to array
if @token.length > 1
@tokens << case @token
when "http" then "http://"
else @token
end
else
@rejected << @token
end
@token = ""
end
if char =~ /[\w|\-|\+|&|\=|\?]/
@token << char
elsif char != "."
@rejected << char
end
last_char = char
end
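# For the made-up fragment above, @tokens comes out as
# ["http://", "www", "example", "com"]: a bare "http" token is rewritten to
# "http://", while tokens of one character or less and characters outside the
# class above end up in @rejected.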
################################################################################
# Iterate through tokens to create URLs and newlines
################################################################################
@words = []
last_token = ''
is_url = false
tmp = []
@tokens.each do |token|
if token=~/^http/ && is_url
@words << tmp
tmp = []
elsif token=="SKHIP-PARSER-SEGMENT" && is_url
@words << tmp
tmp = []
is_url = false
elsif token == "http://"
is_url = true
end
if is_url
tmp << token
elsif token=="SKHIP-PARSER-SEGMENT"
@words << options.line
elsif token=~/IADefault/
@words << box_me_up(token)
else
@words << token
end
last_token = token
end
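# Continuing the made-up example, the URL run collects into
# ["http://", "www", "example", "com"], which the print loop below joins back
# into "http://www.example.com"; each SKHIP-PARSER-SEGMENT token has already
# been swapped for a ─ separator line here.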
################################################################################
# Print it out
################################################################################
@words.each do |word|
puts word.is_a?(Array) ? word.join('.').sub('.','') : word
end
puts "\n\nFinished! Parsing took #{(Time.now - TIME).round(2)} seconds\n\n"
require 'io/console'
require 'optparse'
require 'ostruct'
require 'fileutils'
NAME = "Safari Keyword History Index Parser"
VERSION = "v0.0.1"
TIME = Time.now
def box_me_up(str)
width = (STDOUT.winsize[-1].to_i-2).times.collect{'═'}.join
# width = (str.length+2).times.collect{'═'}.join
str = str.center(STDOUT.winsize[-1]-4,' ')
boxed = "╔#{width}╗\n║ #{str} ║\n╚#{width}╝\n"
end
def show_error_and_exit
puts @option_parser.banner
puts " #{$!}\n use --help for more information\n\n"
exit 1
end
################################################################################
# Command Line Options
################################################################################
# We set default values here.
options = OpenStruct.new
options.name = "HistoryIndex"
options.path = "/Users/#{ENV['USER']}/Library/Safari/"
options.time = "#{TIME.strftime("%Y%m%d%H%M%S")}"
options.line = STDOUT.winsize[-1].times.collect{'─'}.join
options.stdo = true
options.input = "#{options.name}.sk"
options.output = "#{options.name}-#{options.time}.txt"
options.backup = "#{options.name}-#{options.time}.backup"
options.dump = "#{options.name}-#{options.time}.dump"
# Start parsing those options
@option_parser = OptionParser.new do |opts|
opts.program_name = NAME
opts.version = VERSION
opts.banner = "#{box_me_up("#{NAME} #{VERSION}")}
Usage: ruby skhip.rb [options]\n\n"
opts.separator " Specific options:"
opts.on "-i", "--input FILE",
"Path to HistoryIndex.sk" \
do |input|
options.input = input
options.name = input[/(?<=\/)[\w]+(?=\.)/]
options.output = "#{options.name}-#{options.time}.txt"
File.open(input)
end
opts.separator ""
opts.on "-o", "--output FILE",
"Relative output location" \
do |output|
options.output = output
end
opts.separator ""
opts.on "-d", "--use-dump [FILE]",
"Skip dumping process by specifying an existing dump file.",
"Leave blank to use default path." \
do |dump|
options.dump = dump if dump # keep the default dump path when FILE is omitted
File.open(dump) if dump
end
opts.separator ""
opts.on_tail("-h", "--help", "What you're looking at :P") do
puts opts
exit
end
opts.on_tail("--version", "Show version") do
puts opts.program_name
puts opts.version
exit
end
end
begin
@option_parser.parse!
raise OptionParser::ParseError.new("arguments provided without switches!") unless ARGV.empty?
rescue OptionParser::ParseError
show_error_and_exit
end
################################################################################
# Setup
################################################################################
# Make a copy of the HistoryIndex.sk file
FileUtils.cp options.input, "#{options.backup}" \
rescue show_error_and_exit
# Dump the backup to text with xxd
`xxd -b -c 10 #{options.backup} >> #{options.dump}` \
rescue show_error_and_exit
# Read in all the fragments
IO.foreach(options.dump) do |input|
(@acc ||= []) << input.split(' ').last \
rescue show_error_and_exit
end
# Make it one big ass string
dumped = @acc.join
################################################################################
# Parsing
################################################################################
parser = [] # collect regexps and their corresponding replacements
# Make a big ol' line-breaky thing if there's a lot of dots
parser << { regexp: /(?<=[^\.])\.{256,}(?=[^\.|IA])/, replacement: '...[SKHIP-PARSER-SEGMENT]...' }
# replace sets of 3 dots with a comma
# parser << { regexp: /(?<=[^\.])\.{3}(?=[^\.])/, replacement: ',' }
# remove single non-word characters (between two dots)
# parser << { regexp: /(?<=\.)[](?=\.)/, replacement: '..' }
# Looks like there's some code in there; it's the only place other than the
# URLs that has single dots. Collapse it so it doesn't pollute the URL parsing below.
parser << { regexp: /~(.)+big/, replacement: '*' }
# remove single dots between characters using regex lookarounds
parser << { regexp: /(?<=[^\.])\.{1}(?=[^\.])/, replacement: '' }
# remove single "stand-alone" characters
# parser << { regexp: /(?:\.{2,}|\n)[^-\.]{1,2}(?=\.{2,}|\n)/, replacement: '' }
# gonna assume that URL's HTTP part needs some slashes
# parser << { regexp: /http\.\.(?=[\w])/, replacement: 'http://' }
# put a newline before each URL
# parser << { regexp: /(\/?\.{1,})(?=https?)/, replacement: " " }
# Make a big o' line breaky thing if there's a lot of dots
# parser << { regexp: /(?<=[^\.])\.{256,}(?=[^\.|IA])/, replacement: options.line }
# clean up the ends of the URLs
# parser << { regexp: /\.{2,}(\w|{|}|\\|\d|;)+\n/, replacement: "\n" }
# replace all dot sequences longer than one with a comma
# parser << { regexp: /\.{2,}/, replacement: "," }
# add a newline after "html"... just makes things easier :P
# parser << { regexp: /(?<=html)()[^\n]/, replacement: "\n" }
# surround the headers with a border
# parser << { regexp: /()(?=IA\w+)/, replacement: options.line }
# parser << { regexp: /(?:IADefault)(?:I\w+|T\w+)(\n)/, replacement: options.line }
# Put line breaks in the remaining word blocks
# parser << { regexp: //, replacement: "\n" }
# Execute!
parser.each do |r|
# puts r.inspect
dumped.gsub!(r[:regexp], r[:replacement])
end
puts dumped
################################################################################
# Tokenize that shit
################################################################################
current_index = 0
last_char = ""
@token = ""
@tokens = []
@rejected = []
dumped.split('').each do |char|
char.strip!
if (char == "." && last_char != ".")
# end of a word, add to array
if @token.length > 1
@tokens << case @token
when "http" then "http://"
else @token
end
else
@rejected << @token
end
@token = ""
end
if char =~ /[\w|\-|\+|&|\=|\?]/
@token << char
elsif char != "."
@rejected << char
end
last_char = char
end
puts @tokens.inspect
# puts @rejected.inspect
################################################################################
# Iterate through tokens to create URLs and newlines
################################################################################
@words = []
last_token = ''
is_url = false
tmp = []
@tokens.each do |token|
if token=~/^http/ && is_url
@words << tmp
tmp = []
elsif token=="SKHIP-PARSER-SEGMENT" && is_url
@words << tmp
tmp = []
is_url = false
elsif token == "http://"
is_url = true
end
if is_url
tmp << token
elsif token=="SKHIP-PARSER-SEGMENT"
@words << options.line
elsif token=~/IADefault/
@words << box_me_up(token)
else
@words << token
end
last_token = token
end
puts @words.inspect
@words.each do |word|
w = word.is_a?(Array) ? word.join('.').sub('.','') : word
puts w
end
################################################################################
# Filter Weird Artifacts
################################################################################
# artifacts = []
# collect regexp's and their corresponding replacements
# artifacts << { regexp: /\.\=\=/, replacement: '' }
# artifacts << { regexp: /http0/, replacement: 'http:' }
# artifacts << /http\n.+\n/
# artifacts << /z\.{+\n/
# artifacts << /E\.F\.\w/
# artifacts << /Bud2/
# artifacts << /.?\.["|-]/
# Execute!
# artifacts.each do |artifact|
# regexp = Regexp.new artifact
# words.match(regexp).to_a.each do |match|
# (@removals || @removals=[]) << match.to_s
# end
# words.gsub! regexp, ''
# end
# Remove double spaces
# words.gsub!(/\n{2,}/,"\n")
#
# artifacts.each do |a|
# dumped.gsub!(r[:regexp], r[:replacement])
# end
# puts words
# puts box_me_up('Artifact Removals:')
# @removals.each { |x| puts x }
# output = File.open("history_index_output.txt", 'w+')
# words.each do |word|
# output.puts word
# end
#
# output.close
puts "\n\nFinished! Parsing took #{(Time.now - TIME).round(2)} seconds\n\n"