Skip to content

Instantly share code, notes, and snippets.

@shimondoodkin
Created April 28, 2015 23:35
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save shimondoodkin/48097c83097106e4dd72 to your computer and use it in GitHub Desktop.
Save shimondoodkin/48097c83097106e4dd72 to your computer and use it in GitHub Desktop.
steaming Frequent Terms from Keywords. basic algorithm
package com.nini;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map.Entry;
import com.nini.Repl.Keywords.Pair;
public class Repl {
public static void main(String[] args) {
System.out.println("start");
Keywords terms_extractor=new Keywords();
String [] text="1 1 2 1 1".split("\\s+");
for(String s :text)
{
System.out.println("add word "+s);
terms_extractor.AddWord(s);
}
/*
System.out.println("result");
ArrayList<Keywords.Pair> result=terms_extractor.extractNoDuplicates();
for(Keywords.Pair p :result)
{
System.out.println(p.ch+" = "+p.count);
}
*/
System.out.println("end");
}
static class Keywords
{
/*
* Architecture:
* trail is the current nwords list.
* prevtrail simulates the trail as it was nwords before.
*
* */
public int nwords=5;
private LinkedList<String> trail=new LinkedList<String>();
private LinkedList<String> prevtrail=new LinkedList<String>();
public HashMap<String,Short> counter=new HashMap<String,Short> ();
public HashMap<String,Short> tempcounter=new HashMap<String,Short> ();
public Keywords()
{
reset();
}
public void reset()
{
trail.clear();
prevtrail.clear();
counter.clear();
//run=0;
}
//private int run=0;
private String prevword=null;
private String prevsequence=null;
public void AddWord(String word)
{
//run++;
String sequence="";
if(prevword!=null)
{
prevtrail.addLast(prevword);
}
if(trail.size()==nwords)
{
trail.removeLast();
if(prevtrail.size()>nwords)prevtrail.removeFirst();
//printList("prevtrail"+run,prevtrail);
sequence="";
//for(int i=trail.size()-1;i>=0;i--)
tempcounter.clear();
for(String ch : prevtrail)
{
//String ch=trail.get(i);
sequence=sequence+ch+",";
//System.out.println("remove sequence="+sequence);
if(counter.containsKey(sequence))
{
short count=counter.get(sequence).shortValue();
if(count==1)
counter.remove(sequence);
else
{
if(tempcounter.containsKey(sequence)) System.out.println("error remove same key again: "+ch);
tempcounter.put(sequence,--count);
}
}
else
System.out.println("error removing not exist key: "+ch);
// counter.put(sequence,(short) 1);
}
for(Entry<String, Short> ch : tempcounter.entrySet())
{
counter.put(ch.getKey(),ch.getValue());
}
}
trail.addFirst(word);
prevword=word;
//printList("trail"+run,trail);
sequence="";
//for(int i=trail.size()-1;i>=0;i--)
tempcounter.clear();
for(String ch : trail)
{
//String ch=trail.get(i);
sequence=ch+","+sequence;
//System.out.println("sequence="+sequence);
if(counter.containsKey(sequence))
{
short count=counter.get(sequence).shortValue();
if(tempcounter.containsKey(sequence)) System.out.println("error add same key again: "+ch);
tempcounter.put(sequence,++count);
}
else
counter.put(sequence,(short) 1);
}
for(Entry<String, Short> ch : tempcounter.entrySet())
{
counter.put(ch.getKey(),ch.getValue());
}
prevsequence=sequence;
printListPair("result",extractNoDuplicates());
System.out.println("");
}
public static class DistanceString implements Comparable<DistanceString> {
public DistanceString(String value){this.value=value;}
public String value;
@Override
public int compareTo(DistanceString another) {
return value.compareTo(another.value);
}
}
public static class Pair {
public Pair(String c, short n){ch=c;count=n;}
public String ch;
public short count;
}
private Comparator<Pair> abc_sort= new Comparator<Pair>(){
public int compare(Pair s1, Pair s2) {
return s1.ch.compareTo(s2.ch);
}
};
private Comparator<Pair> length_sort= new Comparator<Pair>(){
public int compare(Pair s1, Pair s2) {
final int l1=s1.ch.length();
final int l2=s1.ch.length();
if (l1<l2) return -1;
if (l1>l2) return 1;
//if (l1==l2) return 0;
return 0;
}
};
public ArrayList<Pair> extractNoDuplicates() {return extractNoDuplicates(1,(short) 1);}
public ArrayList<Pair> extractNoDuplicates(int minwords,short minrepetition)
{
ArrayList<Pair> terms=new ArrayList<Pair> ();
for(Entry<String, Short> p :counter.entrySet())
{
terms.add(new Pair(p.getKey(), p.getValue()));
}
Collections.sort(terms,abc_sort);
Collections.sort(terms,length_sort);
for(int i=0;i<terms.size()-1;)
{
Pair pprev = terms.get(i);
Pair pnext = terms.get(i+1);
if(pnext.ch.startsWith(pprev.ch))
{
if(pprev.count==pnext.count)
terms.remove(i);
else
// {
// pprev.count=(short) (pprev.count-(pnext.count+1));
i++;
// }
}
// maybe without length sort Collections.sort(terms,length_sort);
// else if(pprev.ch.startsWith(pnext.ch))
// {
// if(pprev.count==pnext.count)
// terms.remove(i+1);
// else
// {
// pnext.count=(short) (pnext.count-pprev.count);
// i+=2;
// }
// }
else
i++;
}
for(int i=terms.size()-1;i>=0;i--)
{
Pair pprev = terms.get(i);
if(pprev.count<=minrepetition || pprev.ch.split(",").length<=minwords || (!prevsequence.endsWith(pprev.ch)))terms.remove(i);
}
return terms;
}
}
public static void printList(String name, List<String> list){
String o="";
boolean first=true;
for(String elem : list){
if(first)first=false; else o+=", ";
o+=elem;
}
System.out.println(name+"=["+o+"]");
}
public static void printListPair(String name, List<Pair> list){
boolean first=true;
System.out.println(name+"=[");
for(Pair p :list)
{
System.out.println(p.ch+" = "+p.count+(first?"":","));
if(first)first=false;
}
System.out.println("]");
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment