Skip to content

Instantly share code, notes, and snippets.

@uintdev
Last active April 22, 2023 16:14
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save uintdev/e846618677a5c7646660cd28bcbdee8d to your computer and use it in GitHub Desktop.
Save uintdev/e846618677a5c7646660cd28bcbdee8d to your computer and use it in GitHub Desktop.
Email extractor but in C++ (test port)
/**
* Last modified: 2nd April 2019
* This was a test port from Python to see if C++ would trivially
* improve performance. In this case, the implementation resulted
* in far worse file extraction times in comparison to
* the Python counterpart.
* Of course, the Rust port, which came out long after, easily beat both.
*
* Presented is the snapshot of the last modification,
* with comments and what was commented out.
*/
#include "stdafx.h"
#include <stdio.h>
#include <algorithm>
#include <fstream>
#include <iostream>
#include <locale>
#include <regex>
#include <sstream>
#include <string>
#include <vector>
/// Addresses collected so far. extract() appends to this list and leaves it
/// sorted and de-duplicated; dump() writes its entries out.
std::vector<std::string> emailList;

/**
 * Loads `inputFile` into memory, scans it line by line for email-like
 * tokens and appends every match to the global `emailList`, which is then
 * sorted and stripped of duplicates.
 *
 * Progress and error messages are written to std::cout.
 *
 * @param inputFile Path of the file to scan (opened in binary mode).
 * @return true when `emailList` holds at least one address afterwards;
 *         false when the file cannot be opened or read, the pattern cannot
 *         be compiled/applied, or no address is available.
 */
bool extract(const std::string inputFile) {
    std::cout << "Reading emails from '" << inputFile << "'..." << std::endl;

    std::ifstream file(inputFile, std::ios_base::binary); // attempt to read file
    if (!file.is_open()) {
        std::cout << "Unable to read file." << std::endl; // error reading file
        return false;
    }
    std::cout << "File found." << std::endl;

    // Determine the file size by seeking to the end and back.
    file.seekg(0, std::ifstream::end);
    const std::streamoff fileSize = file.tellg();
    file.seekg(0, std::ifstream::beg);
    if (fileSize < 0 || !file) { // tellg() yields -1 when the position is unavailable
        std::cout << "Unable to store file contents. Halting." << std::endl;
        return false;
    }

    // A std::string owns the bytes (released on every return path) and knows
    // its own length, so nothing below depends on a terminating NUL.
    std::string content(static_cast<std::size_t>(fileSize), '\0');
    std::cout << "Allocated " << content.size() << " bytes.\n"
              << "Storing file content in buffer..."
              << std::endl;

    if (!content.empty() &&
        !file.read(&content[0], static_cast<std::streamsize>(content.size()))) {
        std::cout << "Unable to store file contents. Halting." << std::endl;
        return false;
    }
    file.close(); // close file

    std::cout << "Stored :: " << content.size() << "\n"
              << "BUFFER DATA :: ";
    if (!content.empty()) {
        std::cout << content[0]; // only peek at the first byte when there is one
    }
    std::cout << std::endl;

    unsigned int emailsfound = 0;
    try {
        // Compile the pattern once; rebuilding it for every line is costly.
        const std::regex re("[a-zA-Z0-9][a-zA-Z0-9_.]+@[a-zA-Z0-9_]+\\.[a-zA-Z0-9_.]+");

        std::istringstream filedat(content);
        std::string fline;
        while (std::getline(filedat, fline)) {
            const std::sregex_iterator end;
            for (std::sregex_iterator next(fline.begin(), fline.end(), re); next != end; ++next) {
                emailList.push_back(next->str()); // add to list
                ++emailsfound;
                // '\r' keeps the running counter on a single console line.
                std::cout << emailsfound << " emails found\r";
                std::cout.flush();
            }
        }
    }
    catch (const std::regex_error&) {
        std::cout << "Error: malformed regex pattern. Halting." << std::endl;
        return false;
    }

    if (emailList.empty()) {
        std::cout << "No emails found. Halting." << std::endl;
        return false;
    }

    std::cout << "Changing email order..." << std::endl;
    std::sort(emailList.begin(), emailList.end());

    std::cout << "Removing duplicate emails (if any)..." << std::endl;
    // std::unique only collapses adjacent equal elements, hence the sort above.
    emailList.erase(std::unique(emailList.begin(), emailList.end()), emailList.end());

    std::cout << emailList.size() << " email(s) loaded." << std::endl;
    return true;
}
bool dump(std::string outputFile, bool forceoverwritef) {
std::ifstream existf(outputFile);
if (existf && forceoverwritef == false) {
std::string owf;
std::cout << "'" << outputFile << "' already exists. Do you want to overwrite? (y/N) ";
std::cin >> owf;
// convert input to lowercase
std::locale loc;
for (auto elem : owf) {
owf = tolower(elem, loc);
}
if (owf != "y") {
std::cout << "Halted." << std::endl;
return false;
}
}
else {
existf.close();
}
std::ofstream outputf(outputFile);
if (!outputf) {
std::cout << "Unable to write emails to file." << std::endl;
return false;
}
std::cout << "Writing email(s) to file..." << std::endl;
unsigned __int64 remainingemails = 0;
for (auto & emails : emailList) {
outputf << emails << std::endl;
++remainingemails;
std::cout << remainingemails << " out of " << emailList.size() << " written to file. " << int((unsigned __int64)remainingemails * 100 / emailList.size()) << "%\r";
std::cout.flush();
}
std::cout << std::endl;
outputf.close();
std::cout << remainingemails << " email(s) written to '" << outputFile << "'." << std::endl;
return true;
}
/**
 * Command-line entry point.
 *
 * Usage: <program> {input file} {output file} {option}
 * The only option handled is "-f" (third argument), which forces the output
 * file to be overwritten without asking.
 *
 * @return 0 when addresses were extracted and written; 1 when arguments are
 *         missing, extraction fails, or the output step fails or is declined.
 */
int main(int argc, char* argv[]) {
    if (argc < 2) {
        std::cout << "No arguments had been provided." << std::endl;
    }
    else if (argc < 3) {
        std::cout << "No output file was provided." << std::endl;
    }

    // Both cases above are fatal: show the usage text and stop.
    if (argc < 3) {
        std::cout << "\nSyntax: " << argv[0] << " {input file} {output file} {option}"
                  << "\nOptions"
                  << "\n-------------------------------------------"
                  << "\n| -f | Force overwritting of output file |"
                  << "\n-------------------------------------------"
                  << std::endl;
        return 1;
    }

    // defaults
    bool forceoverwrite = false;

    // options
    if (argc > 3) {
        const std::string optionarg1 = argv[3];
        if (optionarg1 == "-f") {
            forceoverwrite = true; // force overwrite
        }
    }

    std::cout << "EMAIL EXTRACTION TOOL" << std::endl
              << "\n\nInput file :: " << argv[1]
              << "\nOutput file :: " << argv[2]
              << std::endl;
    if (forceoverwrite) {
        std::cout << "Force overwrite enabled." << std::endl;
    }
    std::cout << "\n\n";

    if (!extract(argv[1])) return 1;

    // Propagate a failed or declined write through the exit status instead of
    // always reporting success.
    const bool written = dump(argv[2], forceoverwrite);
    std::cout << std::endl;
    return written ? 0 : 1;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment