Created
April 19, 2012 02:58
-
-
Save gubatron/2418081 to your computer and use it in GitHub Desktop.
Blast from the past
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.util.*; | |
import java.io.*; | |
import java.util.regex.*; | |
import java.net.*; | |
/**
 * Multi-threaded NNTP crawler that scans a newsgroup and collects the e-mail
 * addresses found in article headers (or full articles).
 *
 * <p>{@link #main(String[])} asks the server for the article-number range of the
 * group, splits that range between worker threads and starts them. Each worker
 * ({@link #run()}) downloads its articles one by one, extracts addresses with a
 * regular expression, filters machine-generated ones with
 * {@link #validEmail(String)} and appends every new address to
 * <code>&lt;group name&gt;.txt</code> in the working directory.
 *
 * <p>The code deliberately sticks to pre-generics Java (raw {@link Vector},
 * try/finally) to stay compatible with the rest of the file.
 */
final public class NewsCrawler implements Runnable {
    private static final int NEWS_PORT = 119;
    /** Read timeout so a silent server cannot block a worker forever. */
    private static final int SOCKET_TIMEOUT_MS = 30000;
    /** 8-bit clean charset for the wire; avoids depending on the platform default. */
    private static final String WIRE_CHARSET = "ISO-8859-1";
    private static final String COMMAND_HEAD = "head";
    private static final String COMMAND_ARTICLE = "article";

    private static String _newsServer = "news.telcel.net.ve";
    private static String _currentCommand = COMMAND_HEAD;
    private static int _maxThreadCount = 500;
    private static String _groupName = "soc.culture.venezuela";
    private static int _startMsgID = 0;
    private static int _endMsgID = 0;

    /** Number of workers currently inside {@link #run()}; guarded by the class lock. */
    private static int _threadCount = 0;

    /** Every address known so far; also used as the lock for check-then-add. */
    private static final Vector _emails = new Vector();

    /** First article number this worker is responsible for. */
    private int _myStartID;
    /** How many consecutive article numbers this worker scans. */
    private int _myRange;

    /** Regular Expressions */
    private static final String LETTER = "[a-zA-Z]";
    private static final String DIGIT = "[0-9]";
    private static final String LETTER_DIGIT = "[0-9a-zA-Z]";
    private static final String LETTER_DIGIT_HYPHEN = "(?:[0-9a-zA-Z-])";
    private static final String QUOTEDSTRING = "\"[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*\"";
    private static final String ATOM = "(?:[!#-'*+/-9=?A-Z^-~-]+)";
    private static final String SUBDOMAIN = "(?:" + LETTER + "(?:" + LETTER_DIGIT_HYPHEN + "*" + LETTER_DIGIT + ")?)";
    private static final String WORD = "(?:" + ATOM + "|" + QUOTEDSTRING + ")";
    private static final String DOMAIN = "(?:" + SUBDOMAIN + "(?:[\\.]" + SUBDOMAIN + ")+)";
    private static final String LOCALPART = "(?:" + WORD + "(?:[\\.]" + WORD + ")*)";
    private static final String EMAIL = "(?:" + LOCALPART + "[\\@]" + DOMAIN + ")";
    private static final String EMAIL_ADDRESS = "^" + EMAIL + "$";
    private static final String EMAIL_ADDRESS_OPTIONAL = "^(?:" + EMAIL + "?)$";
    private static final Pattern mailPattern = Pattern.compile(EMAIL);
    private static final Pattern urlPattern = Pattern.compile("(http:[\\-+&:$_.+!*'(),A-Za-z0-9\\%/?=]*)",Pattern.CASE_INSENSITIVE);

    /** Lower-case fragments that mark the local part as machine generated. */
    private static final String[] LOGIN_BLACKLIST = { "remove", "abuse", ".", "$" };
    /**
     * Lower-case fragments that mark the domain as machine generated. They are
     * matched against the domain WITH its leading '@', so the '@'-anchored
     * entries only match at the very start of the domain.
     */
    private static final String[] DOMAIN_BLACKLIST = {
        "message-id", "netuser-agent", "news1", "@id-", "abuse",
        "@news.", "@news-", "@posting.google.com", "nntp", "remove"
    };

    private NewsCrawler(int startMsgID, int range) {
        _myStartID = startMsgID;
        _myRange = range;
    }

    private static synchronized void threadStarted() {
        _threadCount++;
    }

    private static synchronized void threadFinished() {
        _threadCount--;
    }

    private static synchronized int runningThreads() {
        return _threadCount;
    }

    /** Sends one CRLF-terminated NNTP command and flushes it. */
    private static void sendCommand(OutputStream os, String command) throws IOException {
        os.write((command + "\r\n").getBytes(WIRE_CHARSET));
        os.flush();
    }

    /** Closes the socket (and with it both of its streams), ignoring close errors. */
    private static void closeQuietly(Socket s) {
        if (s != null) {
            try {
                s.close();
            } catch (IOException ignored) {
                // nothing useful can be done about a failed close
            }
        }
    }

    /**
     * Downloads one article (headers only or the full article, depending on the
     * selected command) over a fresh connection.
     *
     * @param msgID article number inside the current group
     * @return the text of the response, one line per '\n', without the
     *         terminating "." line; the partial text if the connection ended
     *         before the terminator; or <code>null</code> if the article does
     *         not exist or the server could not be reached
     */
    private String getHeader(int msgID) {
        Socket s = null;
        try {
            s = new Socket(_newsServer, NEWS_PORT);
            s.setSoTimeout(SOCKET_TIMEOUT_MS);
            BufferedReader br = new BufferedReader(new InputStreamReader(s.getInputStream(), WIRE_CHARSET));
            OutputStream os = s.getOutputStream();

            //0.- Read the server's greeting
            if (br.readLine() == null) {
                return null;
            }
            //1.- Select the group and read its status line
            sendCommand(os, "group " + _groupName);
            if (br.readLine() == null) {
                return null;
            }
            //2.- Ask for the article; a 22x status line means the text follows
            sendCommand(os, _currentCommand + " " + msgID);
            String status = br.readLine();
            if (status == null || !status.startsWith("22")) {
                return null;
            }
            //3.- Read the multi-line block up to the lone "." terminator.
            //    Lines are kept separate so that an address at the end of one
            //    header cannot fuse with the name of the next header.
            StringBuffer msg = new StringBuffer();
            String line;
            while ((line = br.readLine()) != null && !line.equals(".")) {
                if (line.startsWith("..")) {
                    line = line.substring(1); // undo NNTP dot-stuffing
                }
                msg.append(line).append('\n');
            }
            return msg.toString();
        } catch (Exception e) {
            // Best effort: an unreachable server, a timeout or a broken article
            // simply means "nothing to harvest for this number".
            return null;
        } finally {
            closeQuietly(s);
        }
    }

    /** Returns true if <code>text</code> contains at least one of the fragments. */
    private static boolean containsAny(String text, String[] fragments) {
        for (int i = 0; i < fragments.length; i++) {
            if (text.indexOf(fragments[i]) != -1) {
                return true;
            }
        }
        return false;
    }

    /**
     * Decides whether an address could belong to a person rather than to news
     * software. The comparison is case-insensitive and the address is split at
     * its last '@'.
     *
     * <p>Rejected are addresses that
     * <ul>
     * <li>are <code>null</code> or lack a non-empty part on either side of '@';</li>
     * <li>have a local part of exactly 32 characters (hash-like message ids);</li>
     * <li>have a local part containing "remove", "abuse", "." or "$";</li>
     * <li>have a domain starting with "id-", "news.", "news-" or equal to /
     *     starting with "posting.google.com";</li>
     * <li>have a domain containing "message-id", "netuser-agent", "news1",
     *     "abuse", "nntp" or "remove".</li>
     * </ul>
     *
     * @param email candidate address
     * @return <code>true</code> if none of the rejection rules apply
     */
    public static boolean validEmail(String email) {
        if (email == null) {
            return false;
        }
        String lowered = email.toLowerCase(Locale.ENGLISH);
        int at = lowered.lastIndexOf('@');
        if (at <= 0 || at == lowered.length() - 1) {
            return false; // no '@', empty local part or empty domain
        }
        String login = lowered.substring(0, at);
        // Keep the '@' so that patterns such as "@news." anchor to the domain start.
        String anchoredDomain = lowered.substring(at);

        if (login.length() == 32) {
            return false;
        }
        if (containsAny(login, LOGIN_BLACKLIST)) {
            return false;
        }
        return !containsAny(anchoredDomain, DOMAIN_BLACKLIST);
    }

    /**
     * Extracts every e-mail address found in the given text.
     *
     * @param str text to scan; must not be <code>null</code>
     * @return the matches in order of appearance, or <code>null</code> (not an
     *         empty vector) when there is none
     */
    public Vector getEmails(String str) {
        Vector result = new Vector();
        Matcher matcherEmail = mailPattern.matcher(str);
        while (matcherEmail.find()) {
            result.addElement(matcherEmail.group());
        }
        return result.isEmpty() ? null : result;
    }

    /**
     * Loads the addresses saved by a previous scan of the current group from
     * <code>&lt;group name&gt;.txt</code>. Each line holds one address followed
     * by ';'. Blank lines are skipped; a missing file is not an error.
     */
    public static void loadEmails() {
        BufferedReader br = null;
        try {
            br = new BufferedReader(new InputStreamReader(new FileInputStream(_groupName + ".txt")));
            String line;
            while ((line = br.readLine()) != null) {
                String email = line.trim();
                if (email.endsWith(";")) {
                    email = email.substring(0, email.length() - 1);
                }
                if (email.length() > 0) {
                    _emails.addElement(email);
                }
            }
        } catch (FileNotFoundException ignored) {
            // First scan of this group: there is nothing to load yet.
        } catch (IOException e) {
            System.err.println("Could not read " + _groupName + ".txt: " + e.getMessage());
        } finally {
            if (br != null) {
                try {
                    br.close();
                } catch (IOException ignored) {
                    // nothing useful can be done about a failed close
                }
            }
        }
    }

    /**
     * Appends one address, followed by ";" and CRLF, to
     * <code>&lt;group name&gt;.txt</code>. A failure is reported by printing
     * a single "." and is otherwise ignored.
     *
     * @param email address to store
     */
    public void saveEmail(String email) {
        FileOutputStream fos = null;
        try {
            fos = new FileOutputStream(_groupName + ".txt", true);
            fos.write((email + ";\r\n").getBytes());
            fos.flush();
        } catch (Exception e) {
            System.out.println(".");
        } finally {
            if (fos != null) {
                try {
                    fos.close();
                } catch (IOException ignored) {
                    // nothing useful can be done about a failed close
                }
            }
        }
    }

    /** TODO */
    public int getCompletePercentage() {
        return 0;
    }

    /** Rewrites the single-line progress display on the console. */
    private static void printProgress() {
        System.out.print("\r\t\t\t\t\t\t\t\rFound Emails: " + _emails.size() + " Threads Running: " + runningThreads());
    }

    /**
     * Scans this worker's article range. Every address that is new and passes
     * {@link #validEmail(String)} is remembered and appended to the output file.
     */
    public void run() {
        threadStarted();
        try {
            for (int offset = 0; offset < _myRange; offset++) {
                String headers = getHeader(_myStartID + offset);
                if (headers == null) {
                    continue; // article missing or unreachable
                }
                Vector found = getEmails(headers);
                if (found == null) {
                    continue;
                }
                for (int j = 0; j < found.size(); j++) {
                    String candidate = (String) found.elementAt(j);
                    // Check-then-add must be atomic across workers; holding the
                    // lock while saving also serializes writes to the file.
                    synchronized (_emails) {
                        if (!_emails.contains(candidate) && validEmail(candidate)) {
                            _emails.addElement(candidate);
                            saveEmail(candidate);
                        }
                    }
                    printProgress();
                }
            }
        } finally {
            // Always give the slot back, even if the scan dies unexpectedly.
            threadFinished();
        }
    }

    /** Builds the command-line help text. */
    private static String usage() {
        StringBuffer result = new StringBuffer();
        result.append("\n");
        result.append("===========================================================================\n");
        result.append("NewsCrawler created by Angel Leon.\n");
        result.append("This is not freeware or shareware, you need a license to use this software\n");
        result.append("You can get a license contacting pedidos@wedoit4you.com\n");
        result.append("All rights reserved by www.wedoit4you.com - Apr 19th, 2003 - Caracas, Venezuela\n\n");
        result.append("NewsCrawler <group name> <command> [news server] [threads]\n");
        result.append("command\t-h or -a\t(Read only Headers or Complete Articles)\n\n");
        // The example now includes the mandatory <command> argument.
        result.append("Example: java NewsCrawler soc.culture.usa -h news.myserver.com 1250\n");
        result.append("===========================================================================\n\n");
        return result.toString();
    }

    /**
     * Applies the command-line arguments to the crawler settings.
     *
     * @param args <code>&lt;group name&gt; &lt;command&gt; [news server] [threads]</code>
     * @return <code>false</code> if the arguments are missing or invalid, in
     *         which case the caller should print the usage text
     */
    private static boolean parseArguments(String[] args) {
        if (args.length >= 1) {
            _groupName = args[0];
        }
        if (args.length < 2) {
            return false;
        }
        if (args[1].equalsIgnoreCase("-h")) {
            _currentCommand = COMMAND_HEAD;
        } else if (args[1].equalsIgnoreCase("-a")) {
            _currentCommand = COMMAND_ARTICLE;
        } else {
            System.out.println("\nVerify Input Parameters");
            return false;
        }
        if (args.length >= 3) {
            _newsServer = args[2];
        }
        if (args.length >= 4) {
            int threads;
            try {
                threads = Integer.parseInt(args[3]);
            } catch (NumberFormatException e) {
                threads = 0;
            }
            if (threads < 1) {
                System.out.println("\nVerify Input Parameters");
                return false;
            }
            _maxThreadCount = threads;
        }
        return true;
    }

    /**
     * Connects to the news server, selects the current group and returns the
     * server's status line for it.
     *
     * @return the raw response to the GROUP command, or <code>null</code> if
     *         the server closed the connection without answering
     * @throws IOException if the server cannot be reached or stops responding
     */
    private static String queryGroup() throws IOException {
        Socket s = null;
        try {
            s = new Socket(_newsServer, NEWS_PORT);
            s.setSoTimeout(SOCKET_TIMEOUT_MS);
            BufferedReader br = new BufferedReader(new InputStreamReader(s.getInputStream(), WIRE_CHARSET));
            OutputStream os = s.getOutputStream();
            //0.- Read the server's greeting
            br.readLine();
            System.out.println("Connected to " + _newsServer);
            //1.- Select the group and read its status line
            sendCommand(os, "group " + _groupName);
            String response = br.readLine();
            //2.- Say goodbye; a failure here must not discard the answer we already have
            try {
                sendCommand(os, "quit");
            } catch (IOException ignored) {
                // the server may already have dropped the connection
            }
            return response;
        } finally {
            closeQuietly(s);
        }
    }

    /**
     * Entry point: <code>NewsCrawler &lt;group name&gt; &lt;command&gt; [news server] [threads]</code>.
     * Loads the addresses of a previous scan, asks the server for the group's
     * article range and starts the worker threads.
     */
    public static void main(String[] args) {
        if (!parseArguments(args)) {
            System.out.println(usage());
            return;
        }

        System.out.println("===========================================================================");
        System.out.println("NewsCrawler created by Angel Leon.");
        System.out.println("This is not freeware or shareware, you need a license to use this software");
        System.out.println("You can get a license contacting pedidos@wedoit4you.com for only $US 10");
        System.out.println("All rights reserved by www.wedoit4you.com - Apr 19th, 2003 - Caracas, Venezuela\n\n");
        System.out.println("Group Name: " + _groupName);
        System.out.println("Threads Crawling: " + _maxThreadCount);
        System.out.println("===========================================================================");

        //First thing it should do, would be to load all emails from text file.
        System.out.println("\nLoading email list from previous scan on " + _groupName);
        loadEmails();
        System.out.println(_emails.size() + " Emails loaded\n");

        System.out.println("Connecting to " + _groupName + " on " + _newsServer);
        String response;
        try {
            response = queryGroup();
        } catch (IOException e) {
            System.out.println("Error conecting to " + _newsServer);
            System.out.println("Try Again later.\n");
            return;
        }

        // Expected answer: "211 <count> <low> <high> <group>"
        String[] results = (response == null) ? new String[0] : response.split(" ", -1);
        boolean ok = results.length >= 4;
        if (ok) {
            try {
                ok = Integer.parseInt(results[0]) == 211;
                _startMsgID = Integer.parseInt(results[2]);
                _endMsgID = Integer.parseInt(results[3]);
            } catch (NumberFormatException e) {
                ok = false;
            }
        }
        if (!ok) {
            System.out.println("An error ocurred retrieving " + _groupName + " data.\nTry again later.\n");
            return;
        }

        // long arithmetic: article numbers may be close to Integer.MAX_VALUE
        long total = (long) _endMsgID - _startMsgID + 1;
        if (total < 0) {
            total = 0; // an empty group may report high < low
        }
        System.out.println("\nEstimated Emails to retrieve: " + results[1]);
        System.out.println("Maximum Emails to retrieve: " + total + "\n");
        if (total == 0) {
            return;
        }

        // Round up so the step is never 0 (a 0 step would loop forever) and
        // so that at most _maxThreadCount workers are created.
        int msgsPerThread = (int) ((total + _maxThreadCount - 1) / _maxThreadCount);
        System.out.println("Emails per Thread: " + msgsPerThread);

        //Here we start all the threads.
        for (long first = _startMsgID; first <= _endMsgID; first += msgsPerThread) {
            // The last worker only gets what is left of the range.
            int range = (int) Math.min((long) msgsPerThread, _endMsgID - first + 1);
            new Thread(new NewsCrawler((int) first, range)).start();
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I was barely learning Java when I wrote this; it's so funny reading it now.