Skip to content

Instantly share code, notes, and snippets.

@adrianseeley
Created March 5, 2014 22:08
Show Gist options
  • Save adrianseeley/9377668 to your computer and use it in GitHub Desktop.
Save adrianseeley/9377668 to your computer and use it in GitHub Desktop.
GATO.AUTOFEATURE - automatically optimizes feature selection using weights; capable of handling 1 million+ features. Also draws pretty smears.
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.IO;
using System.Drawing;
using System.Threading;
namespace GATO.AUTOFEATURE
{
/// <summary>
/// Console entry point: loads a comma-separated data set and runs the
/// <see cref="AutoFeaturizer"/> on it.
/// </summary>
class Program
{
    /// <summary>
    /// Reads "iris.data" from the current working directory, treating every column
    /// except the last as a numeric input and the last column as the class label,
    /// then constructs an <see cref="AutoFeaturizer"/> over the loaded cases.
    /// </summary>
    /// <param name="args">Command-line arguments (not used).</param>
    static void Main(string[] args)
    {
        List<TrainingCase> TrainingCases = new List<TrainingCase>();
        #region IRIS
        using (FileStream FS = File.OpenRead("iris.data"))
        using (StreamReader SR = new StreamReader(FS))
        {
            String Line;
            while ((Line = SR.ReadLine()) != null)
            {
                // A blank line (e.g. a trailing newline at the end of the file) would
                // otherwise become a bogus case with no inputs and an empty class label.
                if (Line.Trim().Length == 0)
                {
                    continue;
                }
                String[] Parts = Line.Split(',');
                double[] Inputs = new double[Parts.Length - 1];
                for (int p = 0; p < Inputs.Length; p++)
                {
                    // The data file uses '.' as the decimal separator; parse with the
                    // invariant culture so the result does not depend on the machine locale.
                    Inputs[p] = Double.Parse(Parts[p], System.Globalization.CultureInfo.InvariantCulture);
                }
                TrainingCases.Add(new TrainingCase(Inputs, Parts[Parts.Length - 1]));
            }
        }
        AutoFeaturizer AF = new AutoFeaturizer(
            NumberOfInputs: 4,
            NumberOfClasses: 3,
            MaxTailLength: 1000,
            NumberOfCandidates: 1000,
            TrainingCases: TrainingCases,
            MaxIterations: 10000,
            MaxAge: 100);
        #endregion
        /*
        using (FileStream FS = File.OpenRead("ocr_train.data"))
        {
        using (StreamReader SR = new StreamReader(FS))
        {
        SR.ReadLine(); // skip the header row
        while (!SR.EndOfStream)
        {
        String Line = SR.ReadLine();
        String[] Parts = Line.Split(',');
        List<double> Inputs = new List<double>();
        for (int p = 1; p < Parts.Length; p++)
        {
        Inputs.Add(Double.Parse(Parts[p]));
        }
        TrainingCases.Add(new TrainingCase(Inputs.ToArray(), Parts[0]));
        }
        }
        }*/
    }
}
/// <summary>
/// One labelled sample: a vector of numeric inputs together with its class label.
/// </summary>
class TrainingCase
{
    /// <summary>Numeric input values of this sample.</summary>
    public double[] Inputs;

    /// <summary>Class label of this sample.</summary>
    public String Class;

    /// <summary>
    /// Creates a case holding the given array and label.
    /// The array reference is stored as-is; no copy is made.
    /// </summary>
    /// <param name="Inputs">Numeric input values.</param>
    /// <param name="Class">Class label.</param>
    public TrainingCase(double[] Inputs, String Class)
    {
        this.Class = Class;
        this.Inputs = Inputs;
    }
}
/// <summary>
/// Searches for a weight vector whose weighted input sums separate the classes.
/// Each iteration mutates a pool of candidates from the current <see cref="Head"/>,
/// promotes the best strictly-improving candidate, and otherwise ages the head,
/// falling back to the most recent entry of <see cref="Tail"/> once the head is too old.
/// The whole search runs inside the constructor.
/// </summary>
class AutoFeaturizer
{
    /// <summary>Previous heads, oldest first; used to backtrack when the head stops improving.</summary>
    public List<FeatureConfiguration> Tail;

    /// <summary>Configuration the candidates are currently mutated from.</summary>
    public FeatureConfiguration Head;

    /// <summary>Best configuration seen so far (starts as a copy of the initial head).</summary>
    public FeatureConfiguration Best;

    /// <summary>Reusable pool of candidate configurations, re-mutated every iteration.</summary>
    public List<FeatureConfiguration> Candidates;

    public int NumberOfInputs;
    public int NumberOfClasses;

    /// <summary>Maximum number of entries kept in <see cref="Tail"/>; non-positive keeps no history.</summary>
    public int MaxTailLength;

    public int NumberOfCandidates;

    /// <summary>
    /// Builds the search state and runs the search until the head has at most
    /// <c>NumberOfClasses - 1</c> faults or <paramref name="MaxIterations"/> is reached.
    /// Every iteration prints a progress line and redraws "Test.png"; after the loop
    /// the constructor blocks on <c>Console.ReadLine()</c>.
    /// </summary>
    /// <param name="NumberOfInputs">Number of weights per configuration.</param>
    /// <param name="NumberOfClasses">Number of distinct classes; defines the fault target.</param>
    /// <param name="MaxTailLength">Cap on the backtracking history length.</param>
    /// <param name="NumberOfCandidates">Number of mutated candidates evaluated per iteration.</param>
    /// <param name="TrainingCases">Cases the configurations are scored against.</param>
    /// <param name="MaxIterations">Upper bound on search iterations.</param>
    /// <param name="MaxAge">Number of non-improving iterations after which the head is replaced from the tail.</param>
    public AutoFeaturizer(int NumberOfInputs, int NumberOfClasses, int MaxTailLength, int NumberOfCandidates, List<TrainingCase> TrainingCases, int MaxIterations, int MaxAge)
    {
        this.NumberOfInputs = NumberOfInputs;
        this.NumberOfClasses = NumberOfClasses;
        this.MaxTailLength = MaxTailLength;
        this.NumberOfCandidates = NumberOfCandidates;

        Tail = new List<FeatureConfiguration>();
        Head = new FeatureConfiguration(NumberOfInputs, TrainingCases);
        Best = new FeatureConfiguration(NumberOfInputs, TrainingCases);
        Candidates = new List<FeatureConfiguration>();
        for (int c = 0; c < NumberOfCandidates; c++)
        {
            Candidates.Add(new FeatureConfiguration(NumberOfInputs, TrainingCases));
        }

        Head.AssessFitness(TrainingCases);

        // Seed Best with the assessed head. A freshly constructed configuration has
        // Faults == 0 and Overlaps == 0, so without this "Faults < Best.Faults" could
        // never be true and Best would never be updated.
        Best.CloneFrom(Head);

        for (int i = 0; i < MaxIterations && Head.Faults > NumberOfClasses - 1; i++)
        {
            Console.WriteLine("i: " + i + " O: " + Head.Overlaps + " F: " + Head.Faults + "->" + (NumberOfClasses - 1) + " A: " + Head.Age + " T: " + Tail.Count);
            Head.Draw("Test.png", NumberOfClasses);
            // NOTE(review): the purpose of this 1 ms pause is not evident from this file.
            Thread.Sleep(1);

            double BestOverlaps = Head.Overlaps;
            double BestFaults = Head.Faults;
            int BestCandidateAt = -1;
            for (int c = 0; c < Candidates.Count; c++)
            {
                FeatureConfiguration Candidate = Candidates[c];
                Candidate.MutateFrom(Head);
                Candidate.AssessFitness(TrainingCases);

                // A candidate wins only if it has strictly fewer faults and no more overlaps.
                if (Candidate.Overlaps <= BestOverlaps && Candidate.Faults < BestFaults)
                {
                    BestOverlaps = Candidate.Overlaps;
                    BestFaults = Candidate.Faults;
                    BestCandidateAt = c;
                }
                if (Candidate.Overlaps <= Best.Overlaps && Candidate.Faults < Best.Faults)
                {
                    Best.CloneFrom(Candidate);
                }
            }

            if (BestCandidateAt > -1)
            {
                // Remember the outgoing head for backtracking, dropping the oldest
                // entries so the history never exceeds MaxTailLength.
                if (MaxTailLength > 0)
                {
                    while (Tail.Count >= MaxTailLength)
                    {
                        Tail.RemoveAt(0);
                    }
                    Tail.Add(new FeatureConfiguration(NumberOfInputs, TrainingCases).CloneFrom(Head));
                }
                Head.CloneFrom(Candidates[BestCandidateAt]);
            }
            else
            {
                Head.Age++;
                if (Head.Age > MaxAge && Tail.Count > 0)
                {
                    // The head has stagnated: step back to the most recent previous head.
                    Head.CloneFrom(Tail[Tail.Count - 1]);
                    Tail.RemoveAt(Tail.Count - 1);
                }
            }
        }
        Console.ReadLine();
    }
}
/// <summary>
/// A weight vector plus the "smear" it produces: one (class, weighted sum) entry per
/// training case, sorted by value, from which the fitness counters are derived.
/// </summary>
class FeatureConfiguration
{
    /// <summary>Shared random source used by <see cref="MutateFrom"/>.</summary>
    public static Random r = new Random();

    /// <summary>One weight per input.</summary>
    public double[] Weights;

    /// <summary>One entry per training case; sorted ascending by value after <see cref="AssessFitness"/>.</summary>
    public List<ClassExpression> Smear;

    /// <summary>Number of positions in the sorted smear where the class differs from the previous entry.</summary>
    public double Faults;

    /// <summary>Number of those class changes where both neighbouring entries have exactly the same value.</summary>
    public double Overlaps;

    /// <summary>Counter copied by <see cref="CloneFrom"/>; this class never increments it itself.</summary>
    public int Age;

    /// <summary>
    /// Creates a configuration with <paramref name="NumberOfInputs"/> zero weights and
    /// one empty smear entry per training case.
    /// </summary>
    public FeatureConfiguration(int NumberOfInputs, List<TrainingCase> TrainingCases)
    {
        Weights = new double[NumberOfInputs];
        Smear = new List<ClassExpression>();
        for (int t = 0; t < TrainingCases.Count; t++)
        {
            Smear.Add(new ClassExpression());
        }
    }

    /// <summary>
    /// Overwrites every weight with the host's weight plus uniform noise in [-0.05, 0.05).
    /// Fitness counters and smear are left untouched.
    /// </summary>
    public void MutateFrom(FeatureConfiguration Host)
    {
        for (int w = 0; w < Weights.Length; w++)
        {
            Weights[w] = Host.Weights[w] + ((r.NextDouble() * 0.1) - 0.05);
        }
    }

    /// <summary>
    /// Copies weights, smear entries, fitness counters and age from <paramref name="Host"/>
    /// into this instance.
    /// </summary>
    /// <returns>This instance, to allow chaining.</returns>
    public FeatureConfiguration CloneFrom(FeatureConfiguration Host)
    {
        Host.Weights.CopyTo(Weights, 0);
        for (int s = 0; s < Smear.Count; s++)
        {
            Smear[s].Set(Host.Smear[s].Class, Host.Smear[s].Value);
        }
        Faults = Host.Faults;
        Overlaps = Host.Overlaps;
        Age = Host.Age;
        return this;
    }

    /// <summary>
    /// Computes the weighted input sum of every training case, sorts the smear by that
    /// value and recounts <see cref="Faults"/> and <see cref="Overlaps"/>.
    /// With no smear entries both counters are simply reset to zero.
    /// </summary>
    public void AssessFitness(List<TrainingCase> TrainingCases)
    {
        for (int t = 0; t < TrainingCases.Count; t++)
        {
            double[] Inputs = TrainingCases[t].Inputs;
            double OutputSum = 0;
            for (int i = 0; i < Inputs.Length; i++)
            {
                OutputSum += Weights[i] * Inputs[i];
            }
            Smear[t].Set(TrainingCases[t].Class, OutputSum);
        }
        Smear.Sort((a, b) => a.Value.CompareTo(b.Value));

        Faults = 0;
        Overlaps = 0;
        if (Smear.Count == 0)
        {
            // Nothing to compare; previously this case threw on Smear[0].
            return;
        }

        String LastClass = Smear[0].Class;
        for (int s = 1; s < Smear.Count; s++)
        {
            if (Smear[s].Class != LastClass)
            {
                Faults++;
                // Exact equality is intentional: an overlap is a tie between two
                // neighbouring entries of different classes.
                if (Smear[s].Value == Smear[s - 1].Value)
                {
                    Overlaps++;
                }
                LastClass = Smear[s].Class;
            }
        }
    }

    /// <summary>
    /// Renders the smear to a 1000x1000 image: x is the entry's position in the smear,
    /// y is its value normalised to the smear's value range, colour is its class.
    /// The current counters are printed in the top-left corner.
    /// </summary>
    /// <param name="Filename">Path the image is saved to.</param>
    /// <param name="NumberOfClasses">Used only for the "->target" part of the caption.</param>
    public void Draw(String Filename, int NumberOfClasses)
    {
        double HighestValue = 0;
        double LowestValue = 0;
        List<String> Classes = new List<string>();
        for (int s = 0; s < Smear.Count; s++)
        {
            double Value = Smear[s].Value;
            if (s == 0 || Value > HighestValue)
            {
                HighestValue = Value;
            }
            if (s == 0 || Value < LowestValue)
            {
                LowestValue = Value;
            }
            if (!Classes.Contains(Smear[s].Class))
            {
                Classes.Add(Smear[s].Class);
            }
        }

        double ValueRange = HighestValue - LowestValue;
        if (ValueRange == 0)
        {
            // Degenerate range (all values equal, or no entries): normalise against
            // [-1, 1] to avoid dividing by zero.
            LowestValue = -1;
            ValueRange = 2;
        }

        Brush[] BrushSet =
        {
            Brushes.Red,
            Brushes.Green,
            Brushes.Blue,
            Brushes.Yellow,
            Brushes.Purple,
            Brushes.HotPink,
            Brushes.White,
            Brushes.Lime,
            Brushes.Azure,
            Brushes.Orange
        };

        float size = 1000;
        float border = 100;
        float plotSize = size - (border * 2);

        // Bitmap and Font wrap GDI+ resources; Draw is called repeatedly, so they must
        // be disposed deterministically instead of waiting for finalisation.
        using (Bitmap b = new Bitmap((int)size, (int)size))
        {
            using (Graphics g = Graphics.FromImage(b))
            using (Font CaptionFont = new Font(FontFamily.GenericMonospace, 25, FontStyle.Bold))
            {
                g.Clear(Color.Black);
                for (int n = 0; n < Smear.Count; n++)
                {
                    double Normalized = (Smear[n].Value - LowestValue) / ValueRange;
                    // Wrap around the palette so more classes than brushes reuse
                    // colours instead of indexing past the end.
                    Brush ClassBrush = BrushSet[Classes.IndexOf(Smear[n].Class) % BrushSet.Length];
                    g.FillRectangle(ClassBrush,
                        ((n / (float)Smear.Count) * plotSize) + border,
                        ((float)Normalized * plotSize) + border,
                        5f, 5f);
                }
                g.DrawString("O: " + Overlaps + " F: " + Faults + "->" + (NumberOfClasses - 1), CaptionFont, Brushes.White, 10, 10);
            }
            b.Save(Filename);
        }
    }
}
/// <summary>
/// A class label paired with a numeric value.
/// </summary>
class ClassExpression
{
    /// <summary>Class label of this entry.</summary>
    public String Class;

    /// <summary>Numeric value of this entry.</summary>
    public double Value;

    /// <summary>
    /// Overwrites both fields in place.
    /// </summary>
    /// <param name="Class">New class label.</param>
    /// <param name="Value">New numeric value.</param>
    /// <returns>This instance, so calls can be chained.</returns>
    public ClassExpression Set(String Class, double Value)
    {
        this.Value = Value;
        this.Class = Class;
        return this;
    }
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment