Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
// Learn more about F# at http://fsharp.net
// See the 'F# Tutorial' project for more help.
open System.IO
open SevenZip
open System
SevenZipCompressor.SetLibraryPath(@"path/to/7z.dll")
let dir = @"files/path/..."
let txts = Directory.GetFiles(dir + @"Test")
let sz = SevenZip.SevenZipCompressor()
let st = System.Diagnostics.Stopwatch()
sz.CompressionMethod <- CompressionMethod.Ppmd
sz.CompressionLevel <- CompressionLevel.Low
let compress2 (f:byte[]) =
use mio = new MemoryStream(f)
use m2 = new MemoryStream(f.Length * 2)
sz.CompressStream(mio, m2)
m2.GetBuffer().[0..int m2.Length - 1]
st.Start()
let compressionMap = txts |> Array.map (fun f -> Path.GetFileNameWithoutExtension f, f |> File.ReadAllBytes |> compress2) |> Map.ofArray
let compredist f1 f2 n1 n2 =
let code = compressionMap.[n1]
let code2 = compressionMap.[n2]
let fxy = Array.append f1 f2
let code3 = compress2 (fxy)
float(code3.Length - (min (code.Length) (code2.Length))) / float(max (code.Length) (code2.Length))
let nearEdges = [| for f in txts -> let n1 = Path.GetFileNameWithoutExtension f
n1,
let fbytes = File.ReadAllBytes f
txts |> Array.map (fun fname ->
let n2 = Path.GetFileNameWithoutExtension fname
Path.GetFileNameWithoutExtension fname, compredist fbytes (File.ReadAllBytes(fname)) n1 n2)
|> Array.sortBy snd|]
st.Stop()
let nearEdgesMap = nearEdges |> Map.ofArray
let pairs = nearEdgesMap |> Map.map (fun _ v -> Map.ofArray v)
printfn "%A" nearEdges
type 'a Tree =
| Node of 'a
| Branch of 'a Tree * 'a Tree
type Cluster<'a when 'a : comparison> =
| Singleton of 'a Set
| Clusters of 'a Set * 'a Tree
let completelinkage (ps:Map<'a, Map<'a,float>>) (a: 'a Set) (b:'a Set) =
a |> Set.map (fun item1 -> b |> Set.map (fun item2 -> ps.[item1].[item2])
|> Set.maxElement) //we only want the two largest pair distances)
|> Set.maxElement
let distclust ps = function
| Singleton (item), Clusters(items, _) -> completelinkage ps item items
| Clusters (items, _) , Singleton(item) -> completelinkage ps items item
| Clusters (items1, _), Clusters(items2, _) -> completelinkage ps items1 items2
| Singleton (item1) , Singleton(item2) -> ps.[item1.MaximumElement].[item2.MaximumElement]
let mergeClusters = function
| Singleton (item), Clusters(items, dendogram)
| Clusters (items, dendogram) , Singleton(item) -> Clusters(Set.union item items, Branch(dendogram, Node item.MinimumElement))
| Clusters (items1, dendogram1), Clusters(items2, dendogram2) -> Clusters(Set.union items1 items2, Branch(dendogram1, dendogram2))
| Singleton (item1) , Singleton(item2) -> Clusters(Set.union item1 item2, Branch(Node item1.MinimumElement, Node item2.MinimumElement))
let r = Random()
(*
A function that takes a cluster and a set of clusters and finds the nearest item using cluster dist functions
A function that takes a cluster and an item and calculates distance as maxdist (item, clustermember)
There is a map that holds every item and its neighbiors
If we have an item we find the closest item by looking it up in the map.
But we also need to find the closest in the cluster. So we must compare the item to a cluster
To do this we for each cluster, compare the distance to our current item
If an item is closest we add the merged 2 to the cluster stack as a branch and remove the item from actives
If a cluster is closest we merge the item to the tree, remove it from the cluster stack and add the new tree to the stack
If the next item we are looking at is a cluster we must find the closest item.
To find it in the single set we map each item to its distance from the cluster using dist clust
We also sort the cluster set by distance from current cluster
Again if the single item is the closest we merge with cluster and remove from map;
If the cluster is the closest we remove both clusters from clusterset, merge them and put them back
Recurse
Item , Item -> Pack as a Singleton
*)
// (clusterset : Map<string, string Cluster>)
let asCluster x = Singleton (set [x])
let closestinActives distances cluster (item : string Set) =
item |> Set.map (fun s -> distclust distances (asCluster s, cluster), s)
|> Set.minElement
let find points first closest (distances : Map<string, Map<string,float>>) =
let initialActives = points |> Set.ofArray |> Set.remove first
|> Set.remove closest
let rec seek (stack : string Cluster list) (actives : string Set) =
let current = stack.Head
if stack.Length = 1 && actives = Set.empty then current
else
let nextDist, next = if actives.Count = 0 then Double.MaxValue,"" else closestinActives distances current actives
if stack.Length = 1 then seek (asCluster next :: stack) (actives.Remove(next))
else let topofstack = stack.Tail.Head
let stackDist = distclust distances (topofstack, current)
if nextDist < stackDist then
seek (asCluster next :: stack) (actives.Remove(next))
else seek ((mergeClusters (current, topofstack)) :: (stack.Tail.Tail)) actives
seek [asCluster closest ; asCluster first] initialActives
let rec toGraph depth = function
| Node(x) -> x, "", " node\r\n [\r\n id\t\""+x+"\"\r\n label\t\"" + x + "\"\r\n ]\r\n"
| Branch(ltree,rtree) -> let lname, lgraph, names1 = toGraph (depth + 1) ltree
let rname, rgraph, names2 = toGraph (depth + 1) rtree
let name = string (r.Next(0, int(2. ** (float depth + 9.))) )
name, sprintf "%s\r\n%s\r\n edge\r\n [\r\n source\t\"%s\"\r\n target\t\"%s\"\r\n ]\r\n edge\r\n [\r\n source\t\"%s\"\r\n target\t\"%s\"\r\n ]"
lgraph rgraph name lname name rname,
(sprintf " node\r\n [\r\n id\t\"%s\"\r\n label\t\"\"\r\n ]\r\n" name) + names1 + names2
let first = fst nearEdges.[r.Next(0,txts.Length)]
let closest = fst nearEdgesMap.[first].[1]
let items, fcluster = (function | Clusters(leset, letree) -> leset, letree) (find (nearEdges |> Array.map fst) first closest pairs)
let _, outgraph, nodes = toGraph 0 fcluster
let n = "graph [" + nodes + outgraph + "]"
File.WriteAllText("mbook.gml", n)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.