Last active
December 18, 2015 23:29
-
-
Save moorage/5862214 to your computer and use it in GitHub Desktop.
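Have you ever needed to scrub the email addresses out of a file (such as a SQL dump) so that you can't accidentally email the people in it? This script does exactly that: it rewrites every email address in the file as developers+USER_AT_DOMAIN@thrivesmart.com, i.e. the original address with its "@" replaced by "_AT_", wrapped in a developers+...@thrivesmart.com address.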
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main
import (
	"bufio"
	"flag"
	"fmt"
	"io"
	"log"
	"os"
	"regexp"
	"strings"
)
// readWholeLine returns the next line from r without its line terminator.
//
// bufio.Reader.ReadLine hands back a line in several fragments when the line
// is longer than the reader's buffer; this function joins those fragments
// into one string, so lines of any length are returned whole.
//
// The returned error is non-nil only when no line could be produced: io.EOF
// once the input is exhausted, or the underlying read error. In that case the
// returned string is empty. A final line that is not terminated by a newline
// is returned with a nil error; the io.EOF is reported by the following call.
func readWholeLine(r *bufio.Reader) (string, error) {
	var ln []byte
	for {
		// The fragment is only valid until the next read, so copy it
		// into ln straight away.
		fragment, isPrefix, err := r.ReadLine()
		ln = append(ln, fragment...)
		if err != nil {
			// When an unterminated last line fills the buffer exactly,
			// ReadLine reports its text as prefix fragments and only then
			// returns io.EOF. Hand that text back as a normal line
			// instead of losing it behind the error.
			if err == io.EOF && len(ln) > 0 {
				return string(ln), nil
			}
			return "", err
		}
		if !isPrefix {
			return string(ln), nil
		}
	}
}
// ForeachLine opens the file specified in filename, and reads each line until the \n, one at a time. | |
// It then invokes the callback method (called withLine) with the string gotten from that line. | |
// The purpose of this function is to be as memory efficient as possible when operating on files with many, many lines. | |
// | |
// Example: | |
// ForeachLine("/Users/myuser/Desktop/longlonglong.log", func(line string) { | |
// exampleFunction(line) // ... do whatever I need to with line | |
// }) | |
func ForeachLine(filename string, withLine func(string)) { | |
f, err := os.Open(filename) | |
if err != nil { log.Fatal(err) } | |
r := bufio.NewReader(f) | |
s, e := readWholeLine(r) | |
for e == nil { | |
withLine(s) | |
s,e = readWholeLine(r) | |
} | |
} | |
// Command-line configuration, populated from the flags registered in init.
var (
	// InputFilename is the path of the file to scan for email addresses
	// (flag -i).
	InputFilename string
	// OutputFilename is the path the sanitized copy is written to (flag -o).
	OutputFilename string
)

// init registers the -i and -o flags on the default flag set. Both default to
// the empty string.
func init() {
	flag.StringVar(&InputFilename, "i", "", "path to file with emails")
	flag.StringVar(&OutputFilename, "o", "", "path to output file")
}
func main() { | |
flag.Parse() | |
if len(InputFilename) == 0 { | |
log.Fatalf("Please specify a InputFilename with -i (got: empty string)") | |
} | |
if len(OutputFilename) == 0 { | |
log.Fatalf("Please specify a OutputFilename with -o (got: empty string)") | |
} | |
log.Printf("Reading in from file `%s`\n", InputFilename) | |
log.Printf("Outputting to file `%s`\n", OutputFilename) | |
// open output file | |
fo, err := os.Create(OutputFilename) | |
if err != nil { panic(err) } | |
// make a write buffer | |
w := bufio.NewWriter(fo) | |
re := regexp.MustCompile(`[A-Za-z0-9\.\_\%\-\+\!]+@[A-Za-z0-9\.\-]+\.[A-Za-z]{2,4}`) | |
matchingLineCount := 0 | |
lineCount := 0 | |
ForeachLine(InputFilename, func(line string) { | |
sanitizedLine := line | |
emails := re.FindAllString(line, -1) | |
if len(emails) > 0 { | |
matchingLineCount++ | |
// fmt.Printf("Found an email on line %d (matching lines to date: %d)\n", lineCount, matchingLineCount) | |
} | |
for _, email := range emails { | |
// fmt.Print("Found: " + email + "\n") | |
sanitizedEmail := "developers+" + strings.Replace(email, "@", "_AT_", -1) + "@thrivesmart.com" | |
// fmt.Print(" Replacing with: " + sanitizedEmail + "\n") | |
sanitizedLine = strings.Replace(sanitizedLine, email, sanitizedEmail, -1) | |
} | |
if lineCount >= 35084600 { | |
fmt.Printf("Writing out line %d\n", lineCount) | |
fmt.Print(" =>|"+sanitizedLine+"\n") | |
} | |
if _, err := w.WriteString(sanitizedLine+"\n"); err != nil { | |
fo.Close() | |
panic(err) | |
} | |
lineCount++ | |
}) | |
if err = w.Flush(); err != nil { | |
fo.Close() | |
panic(err) | |
} | |
fmt.Printf("FINISHED on line %d\n", lineCount) | |
if err = fo.Close(); err != nil { | |
panic(err) | |
} | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment