Created
December 28, 2014 14:24
-
-
Save threecourse/d69b82a54241afdeccb0 to your computer and use it in GitHub Desktop.
Clustering
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import numpy as np | |
# 以下記事内のRコードをPythonに移植 | |
# https://www.soa.org/Library/Newsletters/Compact/2009/July/com-2009-iss32.pdf | |
# Pythonでは、演算メインであれば、pandasよりもnumpyで記述するのが扱いやすい | |
# インプット情報 | |
# セル6個・ロケーションパラメータ3個 | |
loc0 = np.array([[23.0, 15.0, 13.0], | |
[10.0, 20.0, 30.0], | |
[24.0, 15.0, 13.0], | |
[10.0, 26.0, 30.0], | |
[25.0, 15.0, 13.0], | |
[10.0, 20.0, 31.0]]) #(6,3) | |
siz = np.array([49.0, 25.0, 5.0, 50.0, 50.0, 100.0]).reshape((6,1)) | |
segments = np.array([0, 1, 0, 1, 0, 1]) #(6,) | |
weights = np.array([1, 1, 10]) #(3,) | |
# 最終的に3個までクラスタリングする | |
endcells = 3 | |
elements = len(siz) | |
mapping_iters = elements - endcells | |
# ロケーションパラメータを正規化し、ウェイトを乗じている。 | |
# 平均・分散はサイズを考慮して計算する。 | |
def get_loc(_loc0, _siz, _weights): | |
loc1 = _siz * _loc0 #(6,3) | |
_avg = loc1.sum(axis=0) / sum(_siz) #(3,) | |
_vars = ((loc1*loc1)/_siz).sum(axis=0) / sum(_siz) - (_avg * _avg) #(3,) | |
_sigma = np.sqrt(_vars) #(3,) | |
return _weights / _sigma * _loc0 #(6,3) | |
loc = get_loc(loc0, siz, weights) | |
# セルの組み合わせを作成する | |
def get_pairs(): | |
listel = range(0,elements) | |
pair_from = np.array(listel * elements) | |
pair_to = np.sort(pair_from) | |
use = (segments[pair_from] == segments[pair_to]) & (pair_from != pair_to) | |
pair_from = pair_from[use] | |
pair_to = pair_to[use] | |
return pair_from, pair_to | |
pair_from, pair_to = get_pairs() | |
# セルの組み合わせごとにロケーションパラメータの距離を作成する | |
def get_pair_dist(): | |
diff = loc[pair_from,]-loc[pair_to,] | |
return np.sqrt((diff*diff).sum(axis=1)) # ロケーションパラメータの差の2乗の和の平方根 | |
pair_dist = get_pair_dist() | |
# セルの情報を作成する | |
cell_size = siz.reshape(6,) #(6,) セルのサイズ | |
cell_dest = np.array(range(0,elements)) #(6,) セルのマッピング先 | |
# 指定したセル数になるまでマッピングを繰り返す | |
for tt in range(0, mapping_iters): | |
# importanceが最も小さいセルの組み合わせを求める | |
importance = cell_size[pair_from] * pair_dist | |
index = np.argsort(importance)[0] | |
print index | |
# importanceが最も小さい組み合わせについて、マッピングを行う | |
mapfrom = pair_from[index] | |
mapto = pair_to[index] | |
print [mapfrom, mapto] | |
# セルの情報を修正する(マッピング元をマッピング先に変更する) | |
cell_dest[cell_dest == mapfrom] = mapto | |
cell_size[mapto] = cell_size[mapto] + cell_size[mapfrom] | |
cell_size[mapfrom] = 0 | |
# セルの組み合わせの情報を修正する(マッピングされたセルを除去) | |
keep = (cell_dest[pair_from]==pair_from) & (cell_dest[pair_to]==pair_to) | |
pair_from = pair_from[keep] | |
pair_to = pair_to[keep] | |
pair_dist = pair_dist[keep] | |
# マッピング先のリストが結果となる | |
print cell_dest | |
# クラスタリングされたセルの中央に最も近い、"most representative policy"を選ぶことはまだ行っていない。 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Collections.Generic; | |
using System.Linq; | |
using System.Text; | |
using System.IO; | |
using C5; // http://www.itu.dk/research/c5/ | |
/* | |
* クラスタリングを行うプログラム | |
* ロジックは以下記事内のRコードと同様 | |
* https://www.soa.org/Library/Newsletters/Compact/2009/July/com-2009-iss32.pdf | |
* | |
* 素朴な方法で実装しているため、計算量はO(n^2 * logn), メモリ使用量はO(n^2)となり、特にメモリ使用量は厳しい。 | |
*/ | |
namespace Clustering | |
{ | |
// 設定項目 | |
static class Settings | |
{ | |
public static string inpath = @"data_n1000.txt"; | |
public static string outpath = @"clsdata_n1000.txt"; | |
public static int endcells = 50; | |
public static double[] weights = new double[] { 1, 1 }; | |
public static int paramLength { get { return weights.Length; } } // パラメータの数 | |
} | |
class Program | |
{ | |
static void Main(string[] args) | |
{ | |
System.Diagnostics.Stopwatch sw = new System.Diagnostics.Stopwatch(); | |
sw.Start(); | |
Clustering cls = new Clustering(); | |
cls.Run(); | |
sw.Stop(); | |
Console.WriteLine(sw.Elapsed); | |
Console.Read(); | |
} | |
} | |
class Clustering | |
{ | |
public void Run() | |
{ | |
Cell[] cells = initializeCells1(); | |
initializeCells2(cells); | |
initializeCells3(cells); | |
Console.WriteLine("Initialized Cells"); | |
runClustering(cells); | |
Console.WriteLine("Clustered Cells"); | |
setRepresentCell(cells); | |
Console.WriteLine("Set Represent Cells"); | |
WriteClusteredCells(cells); | |
} | |
// 初期化 Step1. インプットの読み込み | |
private Cell[] initializeCells1() | |
{ | |
int count = 0; | |
List<Cell> cells = new List<Cell>(); | |
StreamReader sr = new StreamReader(Settings.inpath); | |
sr.ReadLine(); // Skip Header | |
while (sr.Peek() > -1) | |
{ | |
string line = sr.ReadLine(); | |
Cell c = new Cell(count, line); | |
count++; | |
cells.Add(c); | |
} | |
return cells.ToArray(); | |
} | |
// 初期化 Step2. ロケーション変数の調整 | |
private void initializeCells2(Cell[] cells) | |
{ | |
double[] weights = Settings.weights; | |
for (int i = 0; i < cells.Count(); i++) | |
{ | |
cells[i].AdjustedLocationParams = new double[cells[i].LocationParams.Length]; | |
} | |
for (int k = 0; k < weights.Count(); k++) | |
{ | |
// 各パラメータの分散を求める | |
double[] sizes = cells.Select(c => c.OriginalSize).ToArray(); | |
double[] paramvals = cells.Select(c => c.LocationParams[k]).ToArray(); | |
double sizesum = sizes.Sum(); | |
double avg = 0.0; | |
for (int n = 0; n < sizes.Count(); n++) | |
{ | |
avg += sizes[n] * paramvals[n] / sizesum; | |
} | |
double std = 0.0; | |
for (int n = 0; n < sizes.Count(); n++) | |
{ | |
std += sizes[n] * Math.Pow((paramvals[n] - avg), 2) / sizesum; | |
} | |
std = Math.Sqrt(std); | |
double weight = weights[k]; | |
// 正規化し、ウェイトを乗じる | |
double[] stdparamvals = paramvals.Select(v => weight * v / std).ToArray(); | |
// 各セルに調整後の値をセットする | |
for (int i = 0; i < cells.Count(); i++) | |
{ | |
cells[i].AdjustedLocationParams[k] = stdparamvals[i]; | |
} | |
} | |
} | |
// Step3. 結果変数の初期化 | |
private void initializeCells3(Cell[] cells) | |
{ | |
for (int i = 0; i < cells.Count(); i++) | |
{ | |
cells[i].Size = cells[i].OriginalSize; | |
cells[i].friends = new List<Cell> { cells[i] }; | |
} | |
} | |
// クラスタリングを行う | |
private void runClustering(Cell[] cells) | |
{ | |
int cellcount = cells.Count(); | |
int endcells = Settings.endcells; | |
// 有効な組み合わせに対して重要度を計算する。 | |
List<Pair> pairs = new List<Pair>(); | |
for(int i = 0; i < cellcount; i++) | |
for (int j = 0; j < cellcount; j++) | |
{ | |
Cell from = cells[i]; | |
Cell to = cells[j]; | |
if (i != j && from.Segment == to.Segment){ | |
Pair p = new Pair { FromID = i, ToID = j, Distance = Cell.Distance(from, to), FromSize = from.Size }; | |
p.Importance = p.FromSize * p.Distance; | |
pairs.Add(p); | |
} | |
} | |
// 計算量を抑えるため、プライオリティーキューを利用する。 | |
IPriorityQueue<Pair> queue = new IntervalHeap<Pair>(new PairComparer()); | |
queue.AddAll(pairs); | |
while(true){ | |
// 最小のものを取り出す | |
var e = queue.DeleteMin(); | |
Cell from = cells[e.FromID]; | |
Cell to = cells[e.ToID]; | |
// どちらかが生きていない場合、次へ移る | |
if (from.IsDead() || to.IsDead()) continue; | |
// 更新の必要がある場合、更新して次へ移る | |
if (e.FromSize != from.Size) | |
{ | |
Pair p = new Pair { FromID = e.FromID, ToID = e.ToID, FromSize = from.Size, Distance = e.Distance }; | |
p.Importance = p.FromSize * p.Distance; | |
queue.Add(p); | |
continue; | |
} | |
// 利用可能な場合、それを採用する | |
// セルを統合する | |
to.Size += from.Size; | |
from.Size = 0.0; | |
to.friends.AddRange(from.friends); | |
from.friends = null; | |
cellcount--; | |
if (cellcount <= endcells) break; | |
} | |
} | |
// クラスタの中心に最も近いセルを代表セルとする | |
private void setRepresentCell(Cell[] cells) | |
{ | |
// クラスタごとに分ける | |
Dictionary<int, List<Cell>> clusters = new Dictionary<int, List<Cell>>(); | |
for (int i = 0; i < cells.Count(); i++) | |
{ | |
Cell cell = cells[i]; | |
if(!cell.IsDead()){ | |
clusters.Add(cell.ID, cell.friends); | |
} | |
} | |
foreach (int dest in clusters.Keys) | |
{ | |
// セルのサイズと中心を求める | |
Cell[] cls = clusters[dest].ToArray(); | |
var sizeAndCenter = getSizeAndCenter(cls); | |
double size = sizeAndCenter.Item1; | |
double[] centerParams = sizeAndCenter.Item2; | |
// 最も近いセルを求める | |
int rep_id = nearestID(centerParams, cls); | |
for (int i = 0; i < cls.Count(); i++) | |
{ | |
Cell cell = cls[i]; | |
if (rep_id == cell.ID) | |
{ | |
cell.IsRep = true; | |
cell.RepSize = size; | |
} | |
else | |
{ | |
cell.IsRep = false; | |
cell.RepSize = 0.0; | |
} | |
} | |
} | |
} | |
// セルのサイズと中心を求める | |
private Tuple<double, double[]> getSizeAndCenter(Cell[] cells) | |
{ | |
double size = 0.0; | |
double[] _params = new double[Settings.paramLength]; | |
for (int i = 0; i < cells.Count(); i++) | |
{ | |
Cell cell = cells[i]; | |
size += cell.Size; | |
for (int j = 0; j < _params.Length; j++) | |
{ | |
_params[j] += cell.AdjustedLocationParams[j] * cell.Size; | |
} | |
} | |
for (int j = 0; j < _params.Length; j++) | |
{ | |
_params[j] /= size; | |
} | |
return Tuple.Create(size, _params); | |
} | |
// 最も近いセルを求める | |
private int nearestID(double[] centerParams, Cell[] cells) | |
{ | |
int ret = -1; | |
double nearest = -1; | |
for (int i = 0; i < cells.Count(); i++) | |
{ | |
Cell cell = cells[i]; | |
double dist = 0; | |
for (int j = 0; j < centerParams.Length; j++) | |
{ | |
dist += Math.Pow((cell.AdjustedLocationParams[j] - centerParams[j]), 2); | |
} | |
dist = Math.Sqrt(dist); | |
if (ret == -1 || dist < nearest) | |
{ | |
ret = cell.ID; | |
nearest = dist; | |
} | |
} | |
return ret; | |
} | |
// クラスタリング結果をテキストに出力する | |
private void WriteClusteredCells(Cell[] cells) | |
{ | |
StreamWriter sw = new StreamWriter(Settings.outpath); | |
sw.WriteLine("segment\tsize\tparams"); | |
for (int i = 0; i < cells.Count(); i++) | |
{ | |
Cell cell = cells[i]; | |
if (cell.IsRep) | |
{ | |
sw.WriteLine("{0}\t{1}\t{2}", cell.Segment, cell.RepSize, string.Join("\t", cell.LocationParams)); | |
} | |
} | |
sw.Close(); | |
} | |
} | |
// ソート用の比較基準 | |
class PairComparer : System.Collections.Generic.Comparer<Pair> | |
{ | |
public override int Compare(Pair x, Pair y) | |
{ | |
// null, 型チェックは省略 | |
return ((Pair)x).Importance.CompareTo(((Pair)y).Importance); | |
} | |
} | |
// セルの組を表すクラス | |
class Pair | |
{ | |
public int FromID; | |
public int ToID; | |
public double FromSize; | |
public double Distance; | |
public double Importance; | |
} | |
// セルを表すクラス | |
class Cell | |
{ | |
// インプット | |
public int ID; | |
public int Segment; | |
public double OriginalSize; | |
public double[] LocationParams; | |
public Cell(int ID, string line) | |
{ | |
string[] items = line.Split('\t'); | |
this.ID = ID; | |
this.Segment = int.Parse(items[0]); | |
this.OriginalSize = double.Parse(items[1]); | |
this.LocationParams = items.Skip(2).Select(e => double.Parse(e)).ToArray(); | |
} | |
// 中間値 | |
public double[] AdjustedLocationParams; // 正規化・ウェイト後 | |
// 2つのセルの距離の定義 | |
public static double Distance(Cell c1, Cell c2){ | |
int L = c1.LocationParams.Count(); | |
double ret = 0.0; | |
for (int i = 0; i < L; i++) | |
{ | |
ret += Math.Pow((c1.AdjustedLocationParams[i] - c2.AdjustedLocationParams[i]), 2.0); | |
} | |
return Math.Sqrt(ret); | |
} | |
// クラスタリング結果 | |
public double Size; | |
public List<Cell> friends; // 同じクラスタにいるセルのリスト、自分も含む | |
public bool IsDead() | |
{ | |
return friends == null; | |
} | |
// 代表セル設定後の結果 | |
public double RepSize; | |
public bool IsRep; | |
// ToString | |
public override string ToString() | |
{ | |
return string.Format("ID:{0},Seg:{1},OSize:{2},LParams:{3},ALParams:{4}", ID, Segment, OriginalSize, string.Join(",", LocationParams), string.Join(",", AdjustedLocationParams)); | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment