Skip to content

Instantly share code, notes, and snippets.

@gubatron
Created April 19, 2012 02:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gubatron/2418081 to your computer and use it in GitHub Desktop.
Save gubatron/2418081 to your computer and use it in GitHub Desktop.
Blast from the past
import java.util.*;
import java.io.*;
import java.util.regex.*;
import java.net.*;
/**
 * Multi-threaded NNTP crawler that harvests e-mail addresses from a newsgroup.
 *
 * <p>{@link #main(String[])} asks the news server for the article range of a
 * group, splits that range between worker threads (one {@code NewsCrawler}
 * instance per thread) and each worker downloads its articles (headers only or
 * whole articles), extracts every address matching {@link #EMAIL}, filters out
 * machine-generated ones with {@link #validEmail(String)} and appends the new
 * ones to {@code <group name>.txt} in the working directory.
 *
 * <p>Shared state (the set of known addresses and the running-thread counter)
 * is static and guarded by locks; see the field comments.
 *
 * <p>The code deliberately stays at the language level the file already uses
 * (raw collections, explicit try/finally).
 */
final public class NewsCrawler implements Runnable {
    private static final int NEWS_PORT = 119;
    /** Read timeout so a stalled server cannot block a worker forever. */
    private static final int SOCKET_TIMEOUT_MS = 30000;
    private static final String COMMAND_HEAD = "head";
    private static final String COMMAND_ARTICLE = "article";

    private static String _newsServer = "news.telcel.net.ve";
    private static String _currentCommand = COMMAND_HEAD;
    private static int _maxThreadCount = 500;
    private static String _groupName = "soc.culture.venezuela";
    private static int _startMsgID = 0;
    private static int _endMsgID = 0;

    /** Guards {@link #_threadCount}; {@code ++}/{@code --} on an int is not atomic. */
    private static final Object COUNT_LOCK = new Object();
    private static int _threadCount = 0; // currently running worker threads

    /**
     * Every address already written to the output file (elements are Strings).
     * A hash set makes the duplicate check constant time. Compound
     * check-then-add sequences must hold the set's own monitor.
     */
    private static final Set _emails = Collections.synchronizedSet(new HashSet());

    /** Regular Expressions */
    private static final String LETTER = "[a-zA-Z]";
    private static final String DIGIT = "[0-9]";
    private static final String LETTER_DIGIT = "[0-9a-zA-Z]";
    private static final String LETTER_DIGIT_HYPHEN = "(?:[0-9a-zA-Z-])";
    private static final String QUOTEDSTRING = "\"[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*\"";
    private static final String ATOM = "(?:[!#-'*+/-9=?A-Z^-~-]+)";
    private static final String SUBDOMAIN = "(?:" + LETTER + "(?:" + LETTER_DIGIT_HYPHEN + "*" + LETTER_DIGIT + ")?)";
    private static final String WORD = "(?:" + ATOM + "|" + QUOTEDSTRING + ")";
    private static final String DOMAIN = "(?:" + SUBDOMAIN + "(?:[\\.]" + SUBDOMAIN + ")+)";
    private static final String LOCALPART = "(?:" + WORD + "(?:[\\.]" + WORD + ")*)";
    private static final String EMAIL = "(?:" + LOCALPART + "[\\@]" + DOMAIN + ")";
    // The three definitions below are not referenced anywhere in this class.
    private static final String EMAIL_ADDRESS = "^" + EMAIL + "$";
    private static final String EMAIL_ADDRESS_OPTIONAL = "^(?:" + EMAIL + "?)$";
    private static final Pattern urlPattern = Pattern.compile("(http:[\\-+&:$_.+!*'(),A-Za-z0-9\\%/?=]*)",Pattern.CASE_INSENSITIVE);
    private static final Pattern mailPattern = Pattern.compile(EMAIL);

    /** Substrings (lowercase) that mark the part before the '@' as machine generated. */
    private static final String[] LEFT_SIDE_PATTERNS = { "remove", "abuse", ".", "$" };
    /**
     * Substrings (lowercase) that mark the part from the '@' onwards as machine
     * generated. Entries starting with '@' anchor at the start of the domain.
     * "@id-" (rather than "@id") follows the documented "@ID-" form so that
     * ordinary domains beginning with "id" are not rejected.
     */
    private static final String[] RIGHT_SIDE_PATTERNS = {
        "message-id", "netuser-agent", "news1", "@id-", "abuse",
        "@news.", "@news-", "@posting.google.com", "nntp", "remove"
    };

    private final int _myStartID;
    private final int _myRange;

    /**
     * @param startMsgID first article number this worker downloads
     * @param range      how many consecutive article numbers it downloads
     */
    private NewsCrawler(int startMsgID, int range) {
        _myStartID = startMsgID;
        _myRange = range;
    }

    /**
     * Downloads one article (or only its headers, depending on the current
     * command) over a fresh connection.
     *
     * @param msgID article number within the current group
     * @return the response lines, each terminated by '\n' (the final "." line
     *         is not included), or {@code null} when the server did not answer
     *         with a 22x status or an I/O error occurred
     */
    private String getHeader(int msgID) {
        Socket s = null;
        try {
            s = new Socket(_newsServer, NEWS_PORT);
            s.setSoTimeout(SOCKET_TIMEOUT_MS);
            BufferedReader br = new BufferedReader(new InputStreamReader(s.getInputStream()));
            OutputStream os = s.getOutputStream();
            // 0.- Read the server's salutation
            if (br.readLine() == null) {
                return null;
            }
            // 1.- Select the group and read (and ignore) its status line
            os.write(("group " + _groupName + "\r\n").getBytes());
            os.flush();
            if (br.readLine() == null) {
                return null;
            }
            // 2.- Ask for the article; the first line must be a 22x OK response
            os.write((_currentCommand + " " + msgID + "\r\n").getBytes());
            os.flush();
            String status = br.readLine();
            if (status == null || !status.startsWith("22")) {
                return null;
            }
            // 3.- Read until the lone "." terminator (or end of stream). Lines are
            // kept separated so an address ending one line cannot fuse with the
            // first word of the next one.
            StringBuffer msg = new StringBuffer();
            String line;
            while ((line = br.readLine()) != null && !line.equals(".")) {
                msg.append(line).append('\n');
            }
            return msg.toString();
        } catch (IOException e) {
            // Best effort: an unreachable or slow article is simply skipped.
            return null;
        } finally {
            closeQuietly(s); // also closes both socket streams
        }
    }

    /**
     * Given a well formed email determines if it could belong to a person or
     * if it is a newsgroup/newsletter artifact. The check is case-insensitive.
     * Rejected are addresses:
     * <ul>
     * <li>without a non-empty part on both sides of the '@';</li>
     * <li>whose login is exactly 32 characters long (hash-like ids);</li>
     * <li>whose login contains "remove", "abuse", a dot or a '$' sign, e.g.
     *     {@code wvyha.163099$HT6.170611@telenews.teleline.es};</li>
     * <li>whose domain starts with "id-", "news." or "news-", is
     *     "posting.google.com", or contains "message-id", "netuser-agent",
     *     "news1", "abuse", "nntp" or "remove".</li>
     * </ul>
     *
     * @param email candidate address; {@code null} is treated as invalid
     * @return {@code true} when none of the rejection rules applies
     */
    public static boolean validEmail(String email) {
        if (email == null) {
            return false;
        }
        // Fixed locale: the result must not depend on the machine's default locale.
        String lowered = email.toLowerCase(Locale.ENGLISH);
        // A quoted login may itself contain '@', a domain never does.
        int at = lowered.lastIndexOf('@');
        if (at <= 0 || at == lowered.length() - 1) {
            return false;
        }
        String login = lowered.substring(0, at);
        // Keep the '@' so that the "@..." patterns can actually match.
        String domainWithAt = lowered.substring(at);

        if (login.length() == 32) {
            return false;
        }
        return !containsAny(login, LEFT_SIDE_PATTERNS)
                && !containsAny(domainWithAt, RIGHT_SIDE_PATTERNS);
    }

    /** Tells whether {@code text} contains at least one of {@code needles}. */
    private static boolean containsAny(String text, String[] needles) {
        for (int i = 0; i < needles.length; i++) {
            if (text.indexOf(needles[i]) != -1) {
                return true;
            }
        }
        return false;
    }

    /**
     * Extracts every substring of {@code str} that matches {@link #EMAIL}.
     *
     * @param str text to scan, may be {@code null}
     * @return the matches (Strings) in order of appearance, duplicates
     *         included, or {@code null} when there is none
     */
    public Vector getEmails(String str) {
        if (str == null) {
            return null;
        }
        Vector result = new Vector();
        Matcher matcherEmail = mailPattern.matcher(str);
        while (matcherEmail.find()) {
            result.addElement(matcherEmail.group());
        }
        return result.isEmpty() ? null : result;
    }

    /**
     * Loads the addresses saved by a previous scan of the current group from
     * {@code <group name>.txt} (one address per line, terminated by ';').
     * A missing file is not an error; blank lines are skipped.
     */
    public static void loadEmails() {
        File file = new File(_groupName + ".txt");
        if (!file.exists()) {
            return; // first scan of this group
        }
        BufferedReader br = null;
        try {
            br = new BufferedReader(new InputStreamReader(new FileInputStream(file)));
            String line;
            while ((line = br.readLine()) != null) {
                String email = line.trim();
                if (email.endsWith(";")) {
                    email = email.substring(0, email.length() - 1);
                }
                if (email.length() > 0) {
                    _emails.add(email);
                }
            }
        } catch (IOException e) {
            System.err.println("Could not read " + file.getName() + ": " + e.getMessage());
        } finally {
            closeQuietly(br);
        }
    }

    /**
     * Appends {@code email}, followed by ";\r\n", to {@code <group name>.txt}.
     * A failure is only signalled by printing a dot.
     */
    public void saveEmail(String email) {
        FileOutputStream fos = null;
        try {
            fos = new FileOutputStream(_groupName + ".txt", true);
            fos.write((email + ";\r\n").getBytes());
            fos.flush();
        } catch (IOException e) {
            System.out.println(".");
        } finally {
            closeQuietly(fos);
        }
    }

    /** TODO: not implemented, always returns 0. */
    public int getCompletePercentage() {
        return 0;
    }

    /**
     * Worker body: downloads this instance's article range, and for every new,
     * valid address found records it in the shared set and in the output file.
     */
    public void run() {
        adjustThreadCount(1);
        try {
            for (int i = 0; i < _myRange; i++) {
                String headers = getHeader(i + _myStartID);
                if (headers == null) {
                    continue; // article missing or unreachable
                }
                Vector emails = getEmails(headers);
                if (emails == null) {
                    continue;
                }
                int count = emails.size();
                for (int j = 0; j < count; j++) {
                    String candidate = (String) emails.elementAt(j);
                    // Check-then-add must be atomic across workers; holding the
                    // lock during the write also keeps file appends from interleaving.
                    synchronized (_emails) {
                        if (!_emails.contains(candidate) && validEmail(candidate)) {
                            _emails.add(candidate);
                            saveEmail(candidate);
                        }
                    }
                    printProgress();
                }
            }
        } finally {
            // Always give the slot back, even if the worker dies unexpectedly.
            adjustThreadCount(-1);
        }
    }

    /** Rewrites the single-line progress indicator on the console. */
    private static void printProgress() {
        System.out.print("\r\t\t\t\t\t\t\t\rFound Emails: " + _emails.size() + " Threads Running: " + currentThreadCount());
    }

    private static void adjustThreadCount(int delta) {
        synchronized (COUNT_LOCK) {
            _threadCount += delta;
        }
    }

    private static int currentThreadCount() {
        synchronized (COUNT_LOCK) {
            return _threadCount;
        }
    }

    private static void closeQuietly(Socket s) {
        if (s != null) {
            try {
                s.close();
            } catch (IOException ignored) {
                // nothing useful can be done about a failed close
            }
        }
    }

    private static void closeQuietly(Reader r) {
        if (r != null) {
            try {
                r.close();
            } catch (IOException ignored) {
                // nothing useful can be done about a failed close
            }
        }
    }

    private static void closeQuietly(OutputStream o) {
        if (o != null) {
            try {
                o.close();
            } catch (IOException ignored) {
                // nothing useful can be done about a failed close
            }
        }
    }

    /** Builds the command line help text. */
    private static String usage() {
        StringBuffer result = new StringBuffer();
        result.append("\n");
        result.append("===========================================================================\n");
        result.append("NewsCrawler created by Angel Leon.\n");
        result.append("This is not freeware or shareware, you need a license to use this software\n");
        result.append("You can get a license contacting pedidos@wedoit4you.com\n");
        result.append("All rights reserved by www.wedoit4you.com - Apr 19th, 2003 - Caracas, Venezuela\n\n");
        result.append("NewsCrawler <group name> <command> [news server] [threads]\n");
        result.append("command\t-h or -a\t(Read only Headers or Complete Articles)\n\n");
        // The example has to include the mandatory <command> argument.
        result.append("Example: java NewsCrawler soc.culture.usa -h news.myserver.com 1250\n");
        result.append("===========================================================================\n\n");
        return result.toString();
    }

    /**
     * Asks the news server for the status line of the current group.
     *
     * @return the raw answer to the GROUP command, {@code null} if the server
     *         closed the connection before answering
     * @throws IOException if the server cannot be reached or does not greet
     */
    private static String fetchGroupStatus() throws IOException {
        Socket s = null;
        try {
            s = new Socket(_newsServer, NEWS_PORT);
            s.setSoTimeout(SOCKET_TIMEOUT_MS);
            BufferedReader br = new BufferedReader(new InputStreamReader(s.getInputStream()));
            OutputStream os = s.getOutputStream();
            // 0.- Read the server's salutation
            if (br.readLine() == null) {
                throw new IOException("connection closed by " + _newsServer);
            }
            System.out.println("Connected to " + _newsServer);
            // 1.- Send group command and read its status line
            os.write(("group " + _groupName + "\r\n").getBytes());
            os.flush();
            String response = br.readLine();
            // 2.- Quit
            os.write("quit\r\n".getBytes());
            os.flush();
            return response;
        } finally {
            closeQuietly(s);
        }
    }

    /**
     * Parses a "211 count first last ..." GROUP answer.
     *
     * @return {@code {count, first, last}}, or {@code null} when the status
     *         code is not 211 or the line is malformed
     */
    private static int[] parseGroupResponse(String response) {
        if (response == null) {
            return null;
        }
        String[] fields = response.trim().split("\\s+");
        if (fields.length < 4) {
            return null;
        }
        try {
            if (Integer.parseInt(fields[0]) != 211) {
                return null;
            }
            return new int[] {
                Integer.parseInt(fields[1]),
                Integer.parseInt(fields[2]),
                Integer.parseInt(fields[3])
            };
        } catch (NumberFormatException e) {
            return null;
        }
    }

    /**
     * Entry point: {@code NewsCrawler <group name> <command> [news server] [threads]}
     * where {@code command} is {@code -h} (headers only) or {@code -a}
     * (complete articles). Prints the usage text and returns when the
     * arguments are missing or invalid.
     */
    public static void main(String[] args) {
        if (args.length < 2) {
            System.out.println(usage());
            return;
        }
        _groupName = args[0];
        if (args[1].equalsIgnoreCase("-h")) {
            _currentCommand = COMMAND_HEAD;
        } else if (args[1].equalsIgnoreCase("-a")) {
            _currentCommand = COMMAND_ARTICLE;
        } else {
            System.out.println("\nVerify Input Parameters");
            System.out.println(usage());
            return;
        }
        // Optional arguments are cumulative: giving [threads] must not discard
        // the command or the server that precede it.
        if (args.length >= 3) {
            _newsServer = args[2];
        }
        if (args.length >= 4) {
            int threads;
            try {
                threads = Integer.parseInt(args[3]);
            } catch (NumberFormatException e) {
                threads = 0;
            }
            if (threads < 1) {
                System.out.println("\nVerify Input Parameters");
                System.out.println(usage());
                return;
            }
            _maxThreadCount = threads;
        }

        System.out.println("===========================================================================");
        System.out.println("NewsCrawler created by Angel Leon.");
        System.out.println("This is not freeware or shareware, you need a license to use this software");
        System.out.println("You can get a license contacting pedidos@wedoit4you.com for only $US 10");
        System.out.println("All rights reserved by www.wedoit4you.com - Apr 19th, 2003 - Caracas, Venezuela\n\n");
        System.out.println("Group Name: " + _groupName);
        System.out.println("Threads Crawling: " + _maxThreadCount);
        System.out.println("===========================================================================");

        // First thing it should do, would be to load all emails from text file.
        System.out.println("\nLoading email list from previous scan on " + _groupName);
        loadEmails();
        System.out.println(_emails.size() + " Emails loaded\n");

        System.out.println("Connecting to " + _groupName + " on " + _newsServer);
        String response;
        try {
            response = fetchGroupStatus();
        } catch (IOException e) {
            System.out.println("Error conecting to " + _newsServer);
            System.out.println("Try Again later.\n");
            return;
        }

        int[] group = parseGroupResponse(response);
        if (group == null) {
            System.out.println("An error ocurred retrieving " + _groupName + " data.\nTry again later.\n");
            return;
        }
        _startMsgID = group[1];
        _endMsgID = group[2];

        // Both ends are inclusive; long arithmetic avoids overflow near Integer.MAX_VALUE.
        long total = (long) _endMsgID - _startMsgID + 1;
        System.out.println("\nEstimated Emails to retrieve: " + group[0]);
        System.out.println("Maximum Emails to retrieve: " + Math.max(total, 0L) + "\n");
        if (total <= 0) {
            return; // empty group, nothing to crawl
        }
        // Round up so that (a) the step is never 0, which would loop forever,
        // and (b) no more than _maxThreadCount workers are started.
        long msgsPerThread = (total + _maxThreadCount - 1) / _maxThreadCount;
        System.out.println("Emails per Thread: " + msgsPerThread);

        // Here we start all the threads.
        for (long first = _startMsgID; first <= _endMsgID; first += msgsPerThread) {
            int range = (int) Math.min(msgsPerThread, (long) _endMsgID - first + 1);
            new Thread(new NewsCrawler((int) first, range)).start();
        }
    }
}
@gubatron
Copy link
Author

I was barely learning Java when I wrote this; it's so funny reading it now.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment