Created
March 29, 2009 14:09
-
-
Save dizzi/87408 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package cz.dizzi; | |
import java.io.BufferedReader; | |
import java.io.FileReader; | |
import java.util.ArrayList; | |
import java.util.HashMap; | |
public class UniqueFiles { | |
private int counter = 0; | |
private class Node { | |
public HashMap<Long, Node> ChildNodes = new HashMap<Long, Node>(); | |
} | |
private static HashMap<Long, Node> root = new HashMap<Long, Node>(); | |
static ArrayList<Long> tokenize(String line) { | |
String[] sarray = line.split(" "); | |
ArrayList<Long> al = new ArrayList<Long>(); | |
for (String t : sarray) { | |
if (t.startsWith("\"")) | |
continue; | |
try { | |
al.add(Long.parseLong(t, 16)); | |
} catch (Exception e) { | |
System.out.println("Expected exception string: " + t); | |
} | |
} | |
return al; | |
} | |
void processToken(HashMap<Long, Node> cRoot, ArrayList<Long> queue, int index) { | |
// boundaries check | |
if (queue.size() <= index) { | |
return; | |
} | |
Long token = queue.get(index); | |
if (!cRoot.containsKey(token)){ | |
cRoot.put(token, new Node()); | |
}else if((queue.size()-1)==index && cRoot.get(token).ChildNodes.size()==0){ //precount identical rows | |
counter++; | |
} | |
processToken(cRoot.get(token).ChildNodes, queue, ++index); | |
} | |
int countLeafs(HashMap<Long, Node> cRoot){ | |
for(Long key : cRoot.keySet()){ | |
if(cRoot.get(key).ChildNodes.size()==0) | |
counter++; | |
countLeafs(cRoot.get(key).ChildNodes); | |
} | |
return counter; | |
} | |
public static void main(String[] args) throws Exception { | |
BufferedReader in = new BufferedReader(new FileReader("d:/set_small.dat3")); | |
UniqueFiles uf = new UniqueFiles(); | |
int cnt = 0; | |
System.out.println("Building tree..."); | |
long time = System.currentTimeMillis(); | |
// Thread.sleep(10000); | |
while (true) { | |
String line = in.readLine(); | |
if (line == null) | |
break; | |
// System.out.println(line); | |
uf.processToken(root, UniqueFiles.tokenize(line), 0); | |
if (cnt % 1000 == 0) | |
System.out.print(cnt); | |
else if (cnt % 100 == 0) | |
System.out.print("."); | |
cnt++; | |
} | |
System.out.println("."+cnt); | |
System.out.println("Counting leafs..."); | |
int total = uf.countLeafs(root); | |
System.out.println("Running time: " + (System.currentTimeMillis() - time) + "ms"); | |
System.out.println("Diff lines: " + total); | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Building tree... | |
0.........1000.........2000.........3000.........4000.........5000.........6000.........7000.........8000.........9000.........10000.........11000.........12000.........13000.........14000.........15000.........16000.........17000.........18000.........19000.........20000.........21000.........22000.........23000.........24000.........25000.........26000.........27000.........28000.........29000.........30000.........31000.........32000.........33000.........34000.........35000.........36000.........37000.........38000.........39000.........40000.........41000.........42000.........43000.........44000.........45000.........46000.........47000.........48000.........49000.........50000.........51000.........52000.........53000.........54000.........55000.........56000.........57000.........58000.........59000.........60000.........61000.........62000.........63000.........64000.........65000.........66000.........67000.........68000.........69000.........70000.........71000.........72000.........73000.........74000.........75000.........76000.........77000.........78000.........79000.........80000.........81000.........82000.........83000.........84000.........85000.........86000.........87000.........88000.........89000.........90000.........91000.........92000.........93000.........94000.........95000.........96000.........97000.........98000.........99000.........100000.........101000.........102000.........103000.........104000.........105000.........106000.106020 | |
Counting leafs... | |
Running time: 19657ms | |
Diff lines: 96312 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment