Skip to content

Instantly share code, notes, and snippets.

@simonsickle-old
Last active December 30, 2015 04:31
Show Gist options
  • Save simonsickle-old/1fbb65c1194aefb14ba5 to your computer and use it in GitHub Desktop.
Save simonsickle-old/1fbb65c1194aefb14ba5 to your computer and use it in GitHub Desktop.
A C++ command-line tool that finds rows with duplicate email addresses in a CSV file
/*
* Copyright 2015 - Simon Sickle
* CSV DeDuplicator
* Finds duplicate emails and creates a list of duplicates
* usage is ./dedupe /path/to/whatever.csv /path/to/dupe.csv
*/
#include <cstring>
#include <fstream>
#include <iostream>
#include <set>
#include <sstream>
#include <string>
#include <vector>
using namespace std;
/*
 * Finds the first comma-separated field of `line` that contains an '@'.
 *
 * line  - one raw line of the CSV file
 * email - receives the matching field; left untouched when nothing matches
 *
 * Returns true when such a field exists, false otherwise (blank lines,
 * header rows, rows without an address).
 */
static bool findEmailField(const std::string &line, std::string &email)
{
    std::stringstream fields(line);
    std::string field;
    while (std::getline(fields, field, ','))
    {
        if (field.find('@') == std::string::npos)
            continue;
        // A file with CRLF line endings leaves a '\r' on the last column;
        // drop it so the same address compares equal on every row.
        // (field is non-empty here because it contains '@'.)
        if (field[field.size() - 1] == '\r')
            field.erase(field.size() - 1);
        email = field;
        return true;
    }
    return false;
}

/*
 * CSV de-duplicator entry point.
 *
 * argv[1] - path of the CSV file to scan
 * argv[2] - path of the CSV file that receives the duplicate rows
 *
 * The first line of the input is copied to the output as the header. Every
 * later row whose email (first field containing '@') was already seen on an
 * earlier row is appended to the output. Rows without an email field are
 * ignored for duplicate detection.
 *
 * Returns 0 on success, -1 on wrong argument count, 1 when the input cannot
 * be opened or the output cannot be created.
 */
int main (int argc, const char * argv[]) {
    // Check arguments
    if (argc != 3)
    {
        std::cout << "Usage ./dedupe /path/to/a.csv /path/for/dupe.csv" << std::endl;
        return -1;
    }

    // Open the input and make sure it is readable
    std::ifstream textfile(argv[1]);
    if (!textfile.is_open())
    {
        std::cout << "Could not read the file provided" << std::endl;
        return 1;
    }

    int lineCount = 0;                  // lines actually read from the input
    int numDupes = 0;                   // duplicate rows found (header excluded)
    std::set<std::string> seenEmails;   // every distinct email met so far
    std::vector<std::string> dupes;     // output rows: header first, then duplicates
    std::string email;

    // Single pass over the file. The containers grow on demand, so a file
    // without a trailing newline can no longer overflow a pre-sized buffer.
    for (std::string line; std::getline(textfile, line); )
    {
        // Copy headers / first line
        if (lineCount == 0)
            dupes.push_back(line);
        lineCount++;

        if (findEmailField(line, email))
        {
            // insert() reports false in .second when the email was already
            // present, i.e. this row repeats an earlier one.
            if (!seenEmails.insert(email).second)
            {
                dupes.push_back(line);
                numDupes++;
            }
        }
    }
    textfile.close();

    std::cout << "There are " << lineCount << " lines." << std::endl;
    std::cout << "We found " << numDupes << " duplicates." << std::endl;

    // Write the header and the dupes to a new csv file
    std::ofstream txtOut(argv[2]);
    if (!txtOut.is_open())
    {
        std::cout << "File name to save duplicates under was not valid" << std::endl;
        return 1;
    }
    for (std::vector<std::string>::size_type i = 0; i < dupes.size(); i++)
    {
        txtOut << dupes[i] << '\n';
    }
    txtOut.close();

    // Return 0 for good in *nix
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment