Instantly share code, notes, and snippets.

Embed
What would you like to do?
PHP Serialization Fix. For use after a find/replace/sed on a MySQL dump, to repair string lengths in serialized PHP objects. Optimized and refactored for memory-safe production use in Python (~7s for 250MB dump, ~400K replacements). Also written in Perl (~7s for 250MB), Ruby (~8s for 250MB) and Go (~20s for 250MB) for benchmark comparison. Python …
package main
import (
"bufio"
"fmt"
"io"
"os"
"os/exec"
"regexp"
"strconv"
"strings"
)
// counter tallies how many lines have been copied to the output file.
var counter int = 0

// main copies pf.sql to db.sql, streams db.sql line by line through the
// serialized-string pattern (each match is rewritten by replace), writes the
// result to db.sql~, and finally moves db.sql~ over db.sql.
//
// Any failure panics. The final swap only happens after the whole input has
// been written and the output file has been closed successfully, so a failed
// run never installs a truncated db.sql~ as db.sql.
func main() {
	// setup regexp
	re, err := regexp.Compile(`s:(\d+)(:\\?\")(.*?)(\\?\";)`)
	if err != nil {
		panic(err)
	}

	// Start from a clean slate; rm failing because the files do not exist is
	// expected, and a failed cp surfaces as the os.Open error just below.
	exec.Command("rm", "db.sql", "db.sql~").Run()
	exec.Command("cp", "pf.sql", "db.sql").Run()

	// open source file
	s, err := os.Open("db.sql")
	if err != nil {
		panic(err)
	}
	d, err := os.Create("db.sql~")
	if err != nil {
		panic(err)
	}

	r := bufio.NewReaderSize(s, 4096)
	for {
		// read a line; a final line without a trailing newline arrives
		// together with io.EOF and is still processed
		line, err := r.ReadString('\n')
		if err != nil && err != io.EOF {
			panic(err)
		}
		if line == "" {
			fmt.Println(counter, "lines!", "done with this!")
			break
		}
		// write the corrected line
		if _, err := d.WriteString(re.ReplaceAllStringFunc(line, replace)); err != nil {
			panic(err)
		}
		counter = counter + 1
	}

	// Close both files before touching them on disk, and check the results.
	if err := d.Close(); err != nil {
		panic(err)
	}
	if err := s.Close(); err != nil {
		panic(err)
	}

	// Replace the working copy with the corrected output.
	exec.Command("rm", "db.sql").Run()
	if err := exec.Command("mv", "db.sql~", "db.sql").Run(); err != nil {
		panic(err)
	}
}
// replace takes one matched serialized-string token of the form
// s:<len>:"<payload>"; (the quotes may each be preceded by a backslash) and
// returns it with <len> replaced by the byte length of <payload>.
//
// The length is the raw byte count of the payload as it appears in the input;
// no adjustment is made for escape sequences inside the payload.
// Input that does not have the expected shape is returned unchanged.
func replace(matches string) string {
	// Limit the split to three parts so colons inside the payload
	// (e.g. "http://...") stay in the last part instead of truncating it.
	str := strings.SplitN(matches, ":", 3)
	if len(str) < 3 {
		return matches
	}
	body := str[2]

	// Width of the opening delimiter: \" or ".
	var prefix int
	switch {
	case strings.HasPrefix(body, "\\\""):
		prefix = 2
	case strings.HasPrefix(body, "\""):
		prefix = 1
	default:
		// Something's wrong, return what we started with
		return matches
	}

	// Width of the closing delimiter: \"; or ";.
	var suffix int
	switch {
	case strings.HasSuffix(body, "\\\";"):
		suffix = 3
	case strings.HasSuffix(body, "\";"):
		suffix = 2
	default:
		return matches
	}

	payloadLen := len(body) - prefix - suffix
	if payloadLen < 0 {
		// Opening and closing delimiters overlap; not a real token.
		return matches
	}
	return str[0] + ":" + strconv.Itoa(payloadLen) + ":" + body
}
#!/usr/bin/perl
use strict;
use warnings;
# Dump to read; main() opens it read-only.
my $src_filepath = "pf.sql";
# File that receives the corrected copy; main() opens it with '>', so any
# existing content is replaced.
my $dest_filepath = "db.sql";
# Rebuild one serialized-string token with a recomputed length.
#
# Arguments (the four regex captures, in order):
#   1. the length currently declared in the token
#   2. the opening delimiter, from the colon through the quote
#   3. the string payload
#   4. the closing delimiter, from the quote through the semicolon
#
# Returns "s:" followed by the payload's length and the three remaining
# pieces unchanged. The declared length is used only when no length can be
# computed for the payload.
sub fix_numbers
{
    my ($declared_len, $open_delim, $payload, $close_delim) = @_;

    # Everything after the length field is passed through verbatim.
    my $remainder = $open_delim . $payload . $close_delim;

    my $actual_len = length $payload;
    my $chosen_len = defined($actual_len) ? $actual_len : $declared_len;

    return "s:" . $chosen_len . $remainder;
}
# Copy $src_filepath to $dest_filepath line by line, passing every
# serialized-string token through fix_numbers() so its declared length
# matches its payload. Dies on any open, write, or close failure on the
# destination so a short or missing output file is never mistaken for success.
sub main
{
    open(my $SRC, '<', $src_filepath) or die "Can't open $src_filepath: $!";
    open(my $DEST, '>', $dest_filepath ) or die "Can't open file $dest_filepath to write: $!";
    while (my $line = readline($SRC)) {
        # The match is case-sensitive on purpose: fix_numbers() always emits
        # a lowercase "s:", so matching an uppercase "S:" token would
        # silently change its type letter.
        $line =~ s/s:(\d+)(:\\?\")(.*?)(\\?\";)/fix_numbers($1, $2, $3, $4)/eg;
        print {$DEST} $line or die "Can't write to $dest_filepath: $!";
    }
    close($SRC);
    # Buffered write errors (e.g. a full disk) only surface at close time.
    close($DEST) or die "Can't close $dest_filepath: $!";
}
main();
#!/usr/bin/env python
# *** TESTS ***
#returns properly formatted string
#doesnt lose any characters
#both escaped/nonescaped quotes work
#works with quotes (esc/nonesc) within string
#returns proper #
#correctly handles escape sequences
import os, re
# Regexp to match a PHP serialized string's signature
serialized_token = re.compile(r"s:(\d+)(:\\?\")(.*?)(\\?\";)")
# Raw PHP escape sequences
escape_sequences = (r'\n', r'\r', r'\t', r'\v', r'\"', r'\.')
# Return the serialized string with the corrected string length
def _fix_serialization_instance(matches):
target_str = matches.group(3)
ts_len = len(target_str)
# PHP Serialization counts escape sequences as 1 character, so subtract 1 for each escape sequence found
esc_seq_count = 0
for sequence in escape_sequences:
esc_seq_count += target_str.count(sequence)
ts_len -= esc_seq_count
output = 's:{0}{1}{2}{3}'.format(ts_len, matches.group(2), target_str, matches.group(4))
return output
# Accepts a file or a string
# Iterate over a file in memory-safe way to correct all instances of serialized strings (dumb replacement)
def fix_serialization(file):
    """Correct serialized string lengths in a file or in a literal string.

    file -- either the path of an existing file, which is rewritten in place
            (via a temporary ``<path>~`` file), or a string that itself
            contains a serialized token.

    Returns True after rewriting a file, or the corrected string when given a
    literal string. If the argument is neither, prints an error and exits.
    Errors raised while reading or writing an existing file propagate to the
    caller instead of being reinterpreted as "the argument was a string".
    """
    # Decide up front whether the argument names a file, so that a failure
    # halfway through the rewrite is not mistaken for string input.
    try:
        is_path = os.path.isfile(file)
    except (TypeError, ValueError):
        is_path = False

    if is_path:
        temp_path = file + "~"
        # Both handles are closed by the with-statement, even on error.
        with open(file, 'r') as s, open(temp_path, 'w') as d:
            for line in s:
                d.write(re.sub(serialized_token, _fix_serialization_instance, line))
        os.remove(file)
        os.rename(temp_path, file)
        print("file serialized")
        return True

    # Simple input test to see if the user is trying to pass a string directly
    if isinstance(file, str):
        # Force python to see escape sequences as part of a raw string (NOTE: Python V3 uses `unicode-escape` instead)
        raw_file = file.encode('string-escape')
        if re.search(serialized_token, raw_file):
            output = re.sub(serialized_token, _fix_serialization_instance, raw_file)
            print(output)
            print("string serialized")
            return output

    print("Error Occurred: Not a valid input?")
    raise SystemExit()
# EXAMPLES
# fix_serialization('s:2:\"http://murphy.psstudi\r\nosdev.com/wp-content/uploads/2013/03/logo-2.jpg\";')
# fix_serialization('test.txt')
# fix_serialization('texxxxt.txt')
if __name__ == "__main__":
    import sys
    # Only a missing argument earns the usage message; any error raised while
    # processing is left visible instead of being reported as "no file".
    if len(sys.argv) < 2:
        print("No File specified, use `python serialize_fix.py [filename]`")
    else:
        fix_serialization(sys.argv[1])
#!/usr/bin/env ruby
#
# Contributed by Mic Alexander, https://github.com/micalexander
#
# Make UTF-8 the process-wide default for both external data (files) and
# internal strings before any file is read.
Encoding.default_external = Encoding::UTF_8
Encoding.default_internal = Encoding::UTF_8
# Rewrites +file+ in place with every serialized string length corrected.
#
# The whole file is read into memory, passed through fix_text, and written
# back to the same path. Returns the path it was given.
def fix_serialization file
  Encoding.default_external = Encoding::UTF_8
  Encoding.default_internal = Encoding::UTF_8
  string = File.read file
  fixed = fix_text string
  # File.open, not Kernel#open: the latter treats a name that starts with "|"
  # as a command to run rather than a file to write.
  File.open file, 'w' do |io|
    io.write fixed
  end
  return file
end
# php escapes:
# "\\" #Backslash, '"' Double quotes, "\'" Single quotes, "\a" Bell/alert,
# "\b" Backspace, "\r" Carriage Return, "\n" New Line, "\s" Space, "\t" Tab
# Returns a copy of +string+ in which every serialized-string token has its
# declared length replaced by the payload's byte size, less one for each
# escape sequence the payload contains.
def fix_text string
  token_re  = /(s\s*:\s*)(\d+)((\s*:\\*["&])(.*?)(\\?\"\s*;))/
  escape_re = /(\\"|\\'|\\\\|\\a|\\b|\\n|\\r|\\s|\\t|\\v)/

  string.gsub(token_re) do
    # Grab the match data first: the scan below overwrites the regexp globals.
    token   = Regexp.last_match
    payload = token[5]

    escapes = payload.scan(escape_re).size
    length  = payload.bytesize - escapes

    # Leading "s:", new length, then delimiters and payload untouched.
    [token[1], length, token[3]].join
  end
end
# Example check. It calls fix_text because fix_serialization reads its
# argument as a file path and cannot take a literal serialized string.
# test_1 = fix_text "s:12:\"robots\\.txt$\";"
# raise test_1 unless "s:12:\"robots\\.txt$\";" == test_1
# Rewrite the named dump in place.
fix_serialization 'kd-2013-11-01-09-27-production.sql'
a:5:{s:2:"width";i:1200;s:2:"height";i:650;s:1:"file";s:86:"2012/12/home-agave-renamed.jpg";s:5:"sizes";}
a:4:{s:1:"thumbnail";a:4:{s:2:"file";s:60:"home-agave-renamed-430x290.jpg";s:4:"width";i:430;s:6:"height";i:290;s:9:"mime-type";s:10:"image/jpeg";}s:6:"medium";a:4:{s:4:"file";s:30:"home-agave-renamed-460x249.jpg";s:5:"width";i:460;s:6:"height";i:249;s:9:"mime-type";s:10:"image/jpeg";}s:5:"large";a:4:{s:4:"file";s:30:"home-agave-renamed-900x487.jpg";s:5:"width";i:900;s:6:"height";i:487;s:9:"mime-type";s:10:"image/jpeg";}s:8:"detail-2";a:4:{s:4:"file";s:30:"home-agave-renamed-460x517.jpg";s:5:"width";i:460;s:6:"height";i:517;s:9:"mime-type";s:10:"image/jpeg";}}s:10:"image_meta";a:10:{s:8:"aperture";i:0;s:6:"credit";s:0:"";s:6:"camera";s:0:"";s:7:"caption";s:0:"";s:17:"created_timestamp";i:0;s:9:"copyright";s:0:"";s:12:"focal_length";i:0;s:3:"iso";i:0;s:13:"shutter_speed";i:0;s:5:"title";s:0:"";}}
(156,'ttrust_options','a:23:{s:11:\"ttrust_logo\";
s:68:\"http://example.com/wp-content/uploads/2013/03/logo-2.jpg\";
s:0:\"\";
s:163:\"3140 W. Buckeye Road, Phoenix, AZ 85009 | (602) 353-5435\r\n<br /><br />\r\nSupport for this website was provided by a grant from the Robert Wood Johnson Foundation.\";
s:1:\"0\";
}','yes'),(
(157,'ttrust_options','a:23:{s:11:\"ttrust_logo\";
s:71:\"http://example.com/wp-content/uploads/2013/03/logo-2.jpg\";
s:0:\"\";
s:167:\"3140 W. Buckeye Road, Phoenix, AZ 85009 | (602) 353-5435\r\n<br /><br />\r\nSupport for this website was provided by a grant from the Robert Wood Johnson Foundation.\";
s:0:\"\";
}','yes')
@saxena200

This comment has been minimized.

saxena200 commented Sep 5, 2017

Hey the python script is not working

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment