Skip to content

Instantly share code, notes, and snippets.

@astockwell
Last active December 30, 2023 22:39
  • Star 12 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save astockwell/6489489 to your computer and use it in GitHub Desktop.
PHP Serialization Fix. For use after a find/replace/sed on a MySQL dump, to repair string lengths in serialized PHP objects. Optimized and refactored for memory-safe production use in Python (~7s for a 250MB dump, ~400K replacements). Also written in Perl (~7s for 250MB), Ruby (~8s for 250MB) and Go (~20s for 250MB) for benchmark comparison. Python …
package main
import (
"bufio"
"fmt"
"io"
"os"
"os/exec"
"regexp"
"strconv"
"strings"
)
// counter is the running total of lines that main has processed.
var counter int
func main() {
// setup regexp
re, err := regexp.Compile(`s:(\d+)(:\\?\")(.*?)(\\?\";)`)
if err != nil {
println(err)
}
defer exec.Command("mv", "db.sql~", "db.sql").Run()
defer exec.Command("rm", "db.sql").Run()
exec.Command("rm", "db.sql", "db.sql~").Run()
exec.Command("cp", "pf.sql", "db.sql").Run()
// open source file
s, err := os.Open("db.sql")
if err != nil {
panic(err)
}
// close fi on exit and check for its returned error
defer func() {
if err := s.Close(); err != nil {
panic(err)
}
}()
d, err := os.Create("db.sql~")
if err != nil {
println(err)
}
defer func() {
if err := d.Close(); err != nil {
panic(err)
}
}()
r := bufio.NewReaderSize(s, 4096)
// h := md5.New()
for {
// read a chunk
line, err := r.ReadString('\n')
if err != nil && err != io.EOF {
panic(err)
}
if line == "" {
fmt.Println(counter, "lines!", "done with this!")
break
}
// write a chunk
d.WriteString(re.ReplaceAllStringFunc(line, replace))
counter = counter + 1
}
}
// replace recomputes the length prefix of one PHP serialized string token.
//
// matches is the complete text of a single token, for example `s:5:"hello";`
// or its escaped form `s:5:\"hello\";`. The returned token has the same
// delimiters and payload, with the number replaced by the payload's length
// in bytes. Escape sequences inside the payload are counted byte for byte.
//
// If matches does not have the s:<n>:<quote>...<quote>; shape it is returned
// unchanged.
func replace(matches string) string {
	// Split on the first two colons only: the payload may itself contain
	// colons (URLs, nested serialized values) and must stay in one piece.
	parts := strings.SplitN(matches, ":", 3)
	if len(parts) != 3 {
		return matches
	}
	body := parts[2]

	// Strip the opening quote, escaped or not.
	var payload string
	switch {
	case strings.HasPrefix(body, `\"`):
		payload = body[2:]
	case strings.HasPrefix(body, `"`):
		payload = body[1:]
	default:
		// Something's wrong, return what we started with.
		return matches
	}

	// Strip the closing quote and semicolon, escaped or not.
	switch {
	case strings.HasSuffix(payload, `\";`):
		payload = payload[:len(payload)-3]
	case strings.HasSuffix(payload, `";`):
		payload = payload[:len(payload)-2]
	default:
		return matches
	}

	return parts[0] + ":" + strconv.Itoa(len(payload)) + ":" + body
}
#!/usr/bin/perl
use strict;
use warnings;
# Path of the dump that main() opens for reading.
my $src_filepath = "pf.sql";
# Path of the file that main() opens for writing the processed lines.
my $dest_filepath = "db.sql";
# Rebuild one serialized-string token with a recomputed length.
#
# Arguments are the four regex captures, in order:
#   1. the length originally declared in the token
#   2. the opening delimiter (colon plus optionally escaped quote)
#   3. the string payload
#   4. the closing delimiter (optionally escaped quote plus semicolon)
#
# Returns "s:" followed by the payload length and the reassembled remainder;
# the original number is kept only if no length could be determined.
sub fix_numbers
{
    my ($declared, $opening, $payload, $closing) = @_;

    my $remainder = $opening . $payload . $closing;
    my $measured  = length $payload;

    my $count = defined($measured) ? $measured : $declared;
    return "s:" . $count . $remainder;
}
# Stream $src_filepath into $dest_filepath one line at a time, passing every
# serialized-string token on each line through fix_numbers().
#
# Dies if either file cannot be opened, if a line cannot be written, or if the
# output file cannot be closed (buffered data may only fail at close time).
sub main
{
    open(my $SRC, '<', $src_filepath) or die "Can't open $src_filepath: $!";
    open(my $DEST, '>', $dest_filepath ) or die "Can't open file $dest_filepath to write: $!";
    while (my $line = readline($SRC)) {
        # /e evaluates the replacement as code, /g rewrites every token on the line.
        # NOTE(review): /i also matches an upper-case "S:" prefix, which
        # fix_numbers then rewrites as lower-case "s:" -- confirm this is intended.
        $line =~ s/s:(\d+)(:\\?\")(.*?)(\\?\";)/fix_numbers($1, $2, $3, $4)/eig;
        print {$DEST} $line or die "Can't write to $dest_filepath: $!";
    }
    close($SRC) or warn "Can't close $src_filepath: $!";
    close($DEST) or die "Can't close $dest_filepath: $!";
}
# Script entry point: run the conversion.
main();
#!/usr/bin/env python
# *** TESTS ***
# returns a properly formatted string
# doesn't lose any characters
# both escaped and non-escaped quotes work
# works with quotes (escaped/non-escaped) within the string
# returns the proper length number
# correctly handles escape sequences
import os, re
# Regexp to match a PHP serialized string's signature: s:<length>:"<payload>";
# Either quote may be preceded by a backslash. Capture groups:
#   1: the declared length digits
#   2: the colon plus opening quote
#   3: the payload (non-greedy)
#   4: the closing quote plus semicolon
serialized_token = re.compile(r"s:(\d+)(:\\?\")(.*?)(\\?\";)")
# Raw PHP escape sequences, written as literal backslash + character pairs
escape_sequences = (r'\n', r'\r', r'\t', r'\v', r'\"', r'\.')
# Return the serialized string with the corrected string length
def _fix_serialization_instance(matches):
    """Rebuild one serialized-string token with a corrected length.

    `matches` is a regex match whose group 2 is the opening delimiter,
    group 3 the string payload and group 4 the closing delimiter.

    The new length is the payload length minus one for every occurrence of
    an entry in `escape_sequences`, because PHP serialization counts each
    escape sequence as a single character.
    """
    opening = matches.group(2)
    payload = matches.group(3)
    closing = matches.group(4)

    escape_hits = sum(payload.count(seq) for seq in escape_sequences)
    corrected_len = len(payload) - escape_hits

    return 's:{0}{1}{2}{3}'.format(corrected_len, opening, payload, closing)
# Accepts a file or a string
# Iterate over a file in memory-safe way to correct all instances of serialized strings (dumb replacement)
def fix_serialization(file):
    """Correct serialized-string lengths in a file, or in a literal string.

    `file` is first tried as a path. If it can be opened, the file is
    streamed line by line into `<file>~` with every serialized token
    corrected, the original is replaced by that temp file, and True is
    returned.

    If `file` cannot be opened and is a string that contains a serialized
    token, the string itself is corrected, printed and returned.

    Anything else prints an error message and exits the interpreter.

    Errors raised while processing an opened file (read, write, remove,
    rename) propagate to the caller instead of being misreported as invalid
    input.
    """
    # Only a failure to open the path means "this is not a file"; nothing
    # else should fall through to the string branch.
    try:
        source = open(file, 'r')
    except (IOError, OSError, TypeError, ValueError):
        source = None

    if source is not None:
        temp_path = file + "~"
        # Context managers close both handles even if a line fails.
        with source:
            with open(temp_path, 'w') as dest:
                for line in source:
                    dest.write(re.sub(serialized_token, _fix_serialization_instance, line))
        # Swap the corrected copy into place once both handles are closed.
        os.remove(file)
        os.rename(temp_path, file)
        print("file serialized")
        return True

    # Simple input test to see if the user is trying to pass a string directly
    if isinstance(file, str):
        # Force python to see escape sequences as part of a raw string.
        # Python 2 provides the `string-escape` codec; Python 3 does not and
        # uses `unicode_escape` instead.
        try:
            raw_file = file.encode('string-escape')
        except LookupError:
            raw_file = file.encode('unicode_escape').decode('ascii')
        if re.search(serialized_token, raw_file):
            output = re.sub(serialized_token, _fix_serialization_instance, raw_file)
            print(output)
            print("string serialized")
            return output

    print("Error Occurred: Not a valid input?")
    exit()
# EXAMPLES
# fix_serialization('s:2:\"http://murphy.psstudi\r\nosdev.com/wp-content/uploads/2013/03/logo-2.jpg\";')
# fix_serialization('test.txt')
# fix_serialization('texxxxt.txt')
if __name__ == "__main__":
    import sys
    # Only a missing argument earns the usage message. Real failures and
    # SystemExit from fix_serialization must not be reported as
    # "No File specified".
    if len(sys.argv) < 2:
        print("No File specified, use `python serialize_fix.py [filename]`")
    else:
        fix_serialization(sys.argv[1])
#!/usr/bin/env ruby
#
# Contributed by Mic Alexander, https://github.com/micalexander
#
# Use UTF-8 as the default external and internal encoding for this script.
Encoding.default_external = Encoding::UTF_8
Encoding.default_internal = Encoding::UTF_8
# Rewrites +file+ in place with corrected serialized-string lengths.
#
# The whole file is read into memory, passed through fix_text, and written
# back to the same path. Returns +file+.
def fix_serialization file
  Encoding.default_external = Encoding::UTF_8
  Encoding.default_internal = Encoding::UTF_8
  string = File.read file
  fixed = fix_text string
  # File.open always treats its argument as a path; Kernel#open would run a
  # command if the name started with "|".
  File.open file, 'w' do |io|
    io.write fixed
  end
  return file
end
# php escapes:
# "\\" #Backslash, '"' Double quotes, "\'" Single quotes, "\a" Bell/alert,
# "\b" Backspace, "\r" Carriage Return, "\n" New Line, "\s" Space, "\t" Tab
# Returns a copy of +string+ in which every serialized-string token carries a
# recomputed length: the payload's byte size minus one for each PHP escape
# sequence found inside the payload.
def fix_text string
  token_re  = /(s\s*:\s*)(\d+)((\s*:\\*["&])(.*?)(\\?\"\s*;))/
  escape_re = /(\\"|\\'|\\\\|\\a|\\b|\\n|\\r|\\s|\\t|\\v)/
  string.gsub(token_re) do
    # Capture the match data first; the scan below overwrites the regexp globals.
    found   = Regexp.last_match
    prefix  = found[1]
    rest    = found[3]
    payload = found[5]
    length  = payload.bytesize - payload.scan(escape_re).length
    "#{prefix}#{length}#{rest}"
  end
end
# test_1 = fix_text "s:12:\"robots\\.txt$\";"
# raise test_1 unless "s:12:\"robots\\.txt$\";" == test_1
# Script entry point: fix the dump named by the first command-line argument,
# falling back to the original hard-coded file when none is given.
fix_serialization(ARGV[0] || 'kd-2013-11-01-09-27-production.sql')
a:5:{s:2:"width";i:1200;s:2:"height";i:650;s:1:"file";s:86:"2012/12/home-agave-renamed.jpg";s:5:"sizes";}
a:4:{s:1:"thumbnail";a:4:{s:2:"file";s:60:"home-agave-renamed-430x290.jpg";s:4:"width";i:430;s:6:"height";i:290;s:9:"mime-type";s:10:"image/jpeg";}s:6:"medium";a:4:{s:4:"file";s:30:"home-agave-renamed-460x249.jpg";s:5:"width";i:460;s:6:"height";i:249;s:9:"mime-type";s:10:"image/jpeg";}s:5:"large";a:4:{s:4:"file";s:30:"home-agave-renamed-900x487.jpg";s:5:"width";i:900;s:6:"height";i:487;s:9:"mime-type";s:10:"image/jpeg";}s:8:"detail-2";a:4:{s:4:"file";s:30:"home-agave-renamed-460x517.jpg";s:5:"width";i:460;s:6:"height";i:517;s:9:"mime-type";s:10:"image/jpeg";}}s:10:"image_meta";a:10:{s:8:"aperture";i:0;s:6:"credit";s:0:"";s:6:"camera";s:0:"";s:7:"caption";s:0:"";s:17:"created_timestamp";i:0;s:9:"copyright";s:0:"";s:12:"focal_length";i:0;s:3:"iso";i:0;s:13:"shutter_speed";i:0;s:5:"title";s:0:"";}}
(156,'ttrust_options','a:23:{s:11:\"ttrust_logo\";
s:68:\"http://example.com/wp-content/uploads/2013/03/logo-2.jpg\";
s:0:\"\";
s:163:\"3140 W. Buckeye Road, Phoenix, AZ 85009 | (602) 353-5435\r\n<br /><br />\r\nSupport for this website was provided by a grant from the Robert Wood Johnson Foundation.\";
s:1:\"0\";
}','yes'),(
(157,'ttrust_options','a:23:{s:11:\"ttrust_logo\";
s:71:\"http://example.com/wp-content/uploads/2013/03/logo-2.jpg\";
s:0:\"\";
s:167:\"3140 W. Buckeye Road, Phoenix, AZ 85009 | (602) 353-5435\r\n<br /><br />\r\nSupport for this website was provided by a grant from the Robert Wood Johnson Foundation.\";
s:0:\"\";
}','yes')
@saxena200
Copy link

Hey the python script is not working

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment