Skip to content

Instantly share code, notes, and snippets.

@tomonari-masada
Last active January 16, 2023 17:33
Show Gist options
  • Save tomonari-masada/5d68b5002560d309b057 to your computer and use it in GitHub Desktop.
Save tomonari-masada/5d68b5002560d309b057 to your computer and use it in GitHub Desktop.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <time.h>
#include <math.h>
#include <omp.h>
#define real float
#define BUFFSIZE 10000
#define TOKENLEN 1000
#define HASH_SIZE 131073
#define NUM_OF_THREADS 16
#define BORDER "<S>"
#define LAYER_NUM 3
#define VEC_DIM 100
#define SGM_THRESHOLD 15.0
#define TANH_THRESHOLD 8.0
#define TABLE_SIZE 50001
#define LEARNING_RATE 0.02
unsigned int nNumOfWords;
char **sWords;
unsigned int nBorder;
real dLearningRate;
unsigned long long nNumOfDocs;
unsigned int *nDocLen;
unsigned int **nWordSeq;
unsigned int nMaxDocLen;
unsigned int nIteration;
unsigned long long nNumOfSamples;
real dTotalLength;
real dDiffTime;
real dSgmTableFactor;
real *dSgmTable;
real dTanhTableFactor;
real *dTanhTable;
FILE *fOut;
unsigned int nLayerNum;
unsigned int *nVecDim;
unsigned int nLen;
unsigned int *nWSizeI;
unsigned int *nWSizeP;
unsigned int *nWSizeE;
unsigned int *nWSizeO;
real **dI;
real **dP;
real **dE;
real **dO;
real **dH;
real **dAI;
real **dAP;
real **dAE;
real **dAO;
real *dY;
real *dYHat;
real ***dWI;
real ***dWP;
real ***dWE;
real ***dWO;
real **dWOut;
real *dBiasOut;
real **dGH;
real **dGE;
real **dGAI;
real **dGAP;
real **dGAE;
real **dGAO;
real *dGYHat;
real ***dGWI;
real ***dGWP;
real ***dGWE;
real ***dGWO;
real **dGWOut;
real *dGBiasOut;
//----------------------------------------------------------------
time_t start_time;
void put_time(FILE *f)
{
time_t current_time;
time(&current_time);
dDiffTime = difftime(current_time, start_time);
fprintf(f, "%.0lf", dDiffTime);
fflush(f);
}
real randn()
{ return ((real) (((double) rand()) / RAND_MAX)) * 2.0 - 1.0; }
//----------------------------------------------------------------
// hash function by Paul Hsieh
// http://www.azillionmonkeys.com/qed/hash.html
#undef get16bits
#if (defined(__GNUC__) && defined(__i386__)) || defined(__WATCOMC__) \
|| defined(_MSC_VER) || defined (__BORLANDC__) || defined (__TURBOC__)
#define get16bits(d) (*((const uint16_t *) (d)))
#endif
#if !defined (get16bits)
#define get16bits(d) ((((uint32_t)(((const uint8_t *)(d))[1])) << 8) \
+(uint32_t)(((const uint8_t *)(d))[0]) )
#endif
unsigned int SuperFastHash (const char * data, int len) {
uint32_t hash = len, tmp; int rem;
if (len <= 0 || data == NULL) return 0;
rem = len & 3; len >>= 2;
for (;len > 0; len--) {
hash += get16bits (data);
tmp = (get16bits (data+2) << 11) ^ hash;
hash = (hash << 16) ^ tmp;
data += 2*sizeof (uint16_t);
hash += hash >> 11;
}
switch (rem) {
case 3: hash += get16bits (data);
hash ^= hash << 16;
hash ^= ((signed char)data[sizeof (uint16_t)]) << 18;
hash += hash >> 11;
break;
case 2: hash += get16bits (data);
hash ^= hash << 11; hash += hash >> 17;
break;
case 1: hash += (signed char)*data;
hash ^= hash << 10; hash += hash >> 1;
}
hash ^= hash << 3; hash += hash >> 5; hash ^= hash << 4;
hash += hash >> 17; hash ^= hash << 25; hash += hash >> 6;
return ((unsigned int) hash) % HASH_SIZE; // by me
}
//----------------------------------------------------------------
// hash table for words
unsigned int *nWordNums;
unsigned int **nWordIDs;
int initHash()
{
unsigned int n;
nNumOfWords = 0;
nWordNums = (unsigned int *) malloc(sizeof(unsigned int) * HASH_SIZE);
nWordIDs = (unsigned int **) malloc(sizeof(unsigned int *) * HASH_SIZE);
if ((nWordNums == NULL) || (nWordIDs == NULL)) return 1;
for (n = 0; n < HASH_SIZE; n ++) nWordNums[n] = 0;
return 0;
}
unsigned int getWordID(const char *s)
{
unsigned int n, nID, nHashKey;
nID = 0;
nHashKey = SuperFastHash(s, 100);
for (n = 0; n < nWordNums[nHashKey]; n ++)
if (strcmp(s, sWords[nWordIDs[nHashKey][n] - 1]) == 0) {
nID = nWordIDs[nHashKey][n];
break;
}
return nID;
}
unsigned int addWord(const char *s)
{
unsigned int n, nID, nHashKey;
nID = 0;
nHashKey = SuperFastHash(s, 100);
for (n = 0; n < nWordNums[nHashKey]; n ++)
if (strcmp(s, sWords[nWordIDs[nHashKey][n] - 1]) == 0) {
nID = nWordIDs[nHashKey][n];
break;
}
if (nID == 0) {
nNumOfWords ++;
nWordNums[nHashKey] ++;
if (nWordNums[nHashKey] == 1)
nWordIDs[nHashKey] = (unsigned int *) malloc(sizeof(unsigned int));
else
nWordIDs[nHashKey]
= (unsigned int *) realloc(nWordIDs[nHashKey],
sizeof(unsigned int) * nWordNums[nHashKey]);
nWordIDs[nHashKey][nWordNums[nHashKey] - 1] = nNumOfWords;
if (nNumOfWords == 1)
sWords = (char **) malloc(sizeof(char *));
else
sWords = (char **) realloc(sWords, sizeof(char *) * nNumOfWords);
sWords[nNumOfWords - 1] = (char *) malloc(sizeof(char) * (strlen(s) + 1));
memset(sWords[nNumOfWords - 1], 0, strlen(s) + 1);
strcpy(sWords[nNumOfWords - 1], s);
nID = nNumOfWords;
}
return nID;
}
//----------------------------------------------------------------
real sgm(real dA)
{
if (dA <= - SGM_THRESHOLD) return 0.0;
else if (dA >= SGM_THRESHOLD) return 1.0;
else return dSgmTable[((int) ((SGM_THRESHOLD + dA) * dSgmTableFactor))];
}
real _tanh(real dA)
{
if (dA <= - TANH_THRESHOLD) return - 1.0;
else if (dA >= TANH_THRESHOLD) return 1.0;
else return dTanhTable[((int) ((TANH_THRESHOLD + dA) * dTanhTableFactor))];
}
void initParam()
{
unsigned int nCK, nCN, nCC, nC;
for (nCN = 0; nCN < nLayerNum; nCN ++)
for (nCC = 0; nCC < nNumOfWords * nVecDim[nCN]; nCC ++)
dWOut[nCN][nCC] = 0.5 * randn();
for (nCK = 0; nCK < nNumOfWords; nCK ++)
dBiasOut[nCK] = 0.5 * randn();
for (nCN = 0; nCN < nLayerNum; nCN ++)
for (nCC = 0; nCC < nVecDim[nCN]; nCC ++) {
for (nC = 0; nC < nWSizeI[nCN]; nC ++)
dWI[nCN][nCC][nC] = 0.5 * randn();
for (nC = 0; nC < nWSizeP[nCN]; nC ++)
dWP[nCN][nCC][nC] = 0.5 * randn();
dWP[nCN][nCC][nWSizeP[nCN] - 1] = 1.0e10;
for (nC = 0; nC < nWSizeE[nCN]; nC ++)
dWE[nCN][nCC][nC] = 0.5 * randn();
for (nC = 0; nC < nWSizeO[nCN]; nC ++)
dWO[nCN][nCC][nC] = 0.5 * randn();
}
return;
}
void printParam()
{
unsigned int nCK, nCN, nCC, nC;
for (nCK = 0; nCK < nNumOfWords; nCK ++)
fprintf(fOut, "#w %d %s\n", nCK, sWords[nCK]);
for (nCN = 0; nCN < nLayerNum; nCN ++)
for (nCK = 0; nCK < nNumOfWords; nCK ++) {
fprintf(fOut, "#W %d %s [", nCN, sWords[nCK]);
for (nCC = 0; nCC < nVecDim[nCN]; nCC ++) {
fprintf(fOut, "%f", dWOut[nCN][nCK * nVecDim[nCN] + nCC]);
if (nCC < nVecDim[nCN] - 1) fprintf(fOut, ",");
}
fprintf(fOut, "]\n");
}
for (nCK = 0; nCK < nNumOfWords; nCK ++)
fprintf(fOut, "#B %s %f\n", sWords[nCK], dBiasOut[nCK]);
for (nCN = 0; nCN < nLayerNum; nCN ++) {
for (nC = 0; nC < nWSizeI[nCN]; nC ++) {
fprintf(fOut, "#I %d %d [", nCN, nC);
for (nCC = 0; nCC < nVecDim[nCN]; nCC ++) {
fprintf(fOut, "%f", dWI[nCN][nCC][nC]);
if (nCC < nVecDim[nCN] - 1) fprintf(fOut, ",");
}
fprintf(fOut, "]\n");
}
for (nC = 0; nC < nWSizeP[nCN]; nC ++) {
fprintf(fOut, "#P %d %d [", nCN, nC);
for (nCC = 0; nCC < nVecDim[nCN]; nCC ++) {
fprintf(fOut, "%f", dWP[nCN][nCC][nC]);
if (nCC < nVecDim[nCN] - 1) fprintf(fOut, ",");
}
fprintf(fOut, "]\n");
}
for (nC = 0; nC < nWSizeE[nCN]; nC ++) {
fprintf(fOut, "#E %d %d [", nCN, nC);
for (nCC = 0; nCC < nVecDim[nCN]; nCC ++) {
fprintf(fOut, "%f", dWE[nCN][nCC][nC]);
if (nCC < nVecDim[nCN] - 1) fprintf(fOut, ",");
}
fprintf(fOut, "]\n");
}
for (nC = 0; nC < nWSizeO[nCN]; nC ++) {
fprintf(fOut, "#O %d %d [", nCN, nC);
for (nCC = 0; nCC < nVecDim[nCN]; nCC ++) {
fprintf(fOut, "%f", dWO[nCN][nCC][nC]);
if (nCC < nVecDim[nCN] - 1) fprintf(fOut, ",");
}
fprintf(fOut, "]\n");
}
}
return;
}
real forward(unsigned long long nCD)
{
real dLogLH;
unsigned int nLen;
unsigned int nCT, nCK, nCN, nCC, nTemp;
real dTemp;
dLogLH = 0.0;
nLen = nDocLen[nCD] + 1;
for (nCT = 0; nCT < nLen; nCT ++){
if (nCT > 0) nCK = nWordSeq[nCD][nCT - 1];
else nCK = nBorder;
for (nCN = 0; nCN < nLayerNum; nCN ++) {
#pragma omp parallel
{
unsigned nCC, nC;
unsigned int nTempW, nTemp;
real dTempI, dTempP, dTempE, dTempO, dTemp;
#ifdef _OPENMP
unsigned int nThreadID = omp_get_thread_num();
unsigned int nNumOfThreads = omp_get_num_threads();
#else
unsigned int nThreadID = 0;
unsigned int nNumOfThreads = 1;
#endif
for (nCC = nThreadID; nCC < nVecDim[nCN]; nCC += nNumOfThreads) {
dTempI = dWI[nCN][nCC][nCK];
dTempP = dWP[nCN][nCC][nCK];
dTempE = dWE[nCN][nCC][nCK];
dTempO = dWO[nCN][nCC][nCK];
nTempW = nNumOfWords;
if (nCN > 0) {
for (nC = 0; nC < nVecDim[nCN - 1]; nC ++) {
dTemp = dH[nCN - 1][nVecDim[nCN - 1] * nCT + nC];
dTempI += dWI[nCN][nCC][nTempW + nC] * dTemp;
dTempP += dWP[nCN][nCC][nTempW + nC] * dTemp;
dTempE += dWE[nCN][nCC][nTempW + nC] * dTemp;
dTempO += dWO[nCN][nCC][nTempW + nC] * dTemp;
}
nTempW += nVecDim[nCN - 1];
}
if (nCT > 0)
for (nC = 0; nC < nVecDim[nCN]; nC ++) {
dTemp = dH[nCN][nVecDim[nCN] * (nCT - 1) + nC];
dTempI += dWI[nCN][nCC][nTempW + nC] * dTemp;
dTempP += dWP[nCN][nCC][nTempW + nC] * dTemp;
dTempE += dWE[nCN][nCC][nTempW + nC] * dTemp;
dTempO += dWO[nCN][nCC][nTempW + nC] * dTemp;
}
nTempW += nVecDim[nCN];
if (nCT > 0) {
dTemp = dE[nCN][nVecDim[nCN] * (nCT - 1) + nCC];
dTempI += dWI[nCN][nCC][nTempW] * dTemp;
dTempP += dWP[nCN][nCC][nTempW] * dTemp;
}
dTempI += dWI[nCN][nCC][nTempW + 1];
dTempP += dWP[nCN][nCC][nTempW + 1];
dTempE += dWE[nCN][nCC][nTempW];
dTempO += dWO[nCN][nCC][nTempW + 1];
nTemp = nVecDim[nCN] * nCT + nCC;
dAI[nCN][nTemp] = dTempI;
dI[nCN][nTemp] = sgm(dTempI);
dAP[nCN][nTemp] = dTempP;
dP[nCN][nTemp] = sgm(dTempP);
dAE[nCN][nTemp] = dTempE;
dE[nCN][nTemp] = dI[nCN][nTemp] * _tanh(dAE[nCN][nTemp]);
if (nCT > 0)
dE[nCN][nTemp] += dP[nCN][nTemp] * dE[nCN][nVecDim[nCN] * (nCT - 1) + nCC];
dTempO += dWO[nCN][nCC][nTempW] * dE[nCN][nTemp];
dAO[nCN][nTemp] = dTempO;
dO[nCN][nTemp] = sgm(dTempO);
dH[nCN][nTemp] = dO[nCN][nTemp] * _tanh(dE[nCN][nTemp]);
}
}
}
nTemp = nCT * nNumOfWords;
for (nCK = 0; nCK < nNumOfWords; nCK ++) {
dTemp = dBiasOut[nCK];
for (nCN = 0; nCN < nLayerNum; nCN ++)
for (nCC = 0; nCC < nVecDim[nCN]; nCC ++)
dTemp += dWOut[nCN][nVecDim[nCN] * nCK + nCC]
* dH[nCN][nVecDim[nCN] * nCT + nCC];
dYHat[nTemp + nCK] = dTemp;
}
dTemp = dYHat[nTemp];
for (nCK = 1; nCK < nNumOfWords; nCK ++)
if (dTemp < dYHat[nTemp + nCK]) dTemp = dYHat[nTemp + nCK];
for (nCK = 0; nCK < nNumOfWords; nCK ++) dY[nTemp + nCK] = dYHat[nTemp + nCK] - dTemp;
for (nCK = 0; nCK < nNumOfWords; nCK ++) dY[nTemp + nCK] = exp(dY[nTemp + nCK]);
dTemp = 0.0;
for (nCK = 0; nCK < nNumOfWords; nCK ++) dTemp += dY[nTemp + nCK];
for (nCK = 0; nCK < nNumOfWords; nCK ++) dY[nTemp + nCK] /= dTemp;
if (nCT == nLen - 1) dLogLH += log(dY[nTemp + nBorder]);
else dLogLH += log(dY[nTemp + nWordSeq[nCD][nCT]]);
}
return dLogLH;
}
void backward(unsigned long long nCD)
{
unsigned int nLen;
unsigned int nCT, nCK, nCN, nCC;
unsigned int nTemp;
nLen = nDocLen[nCD] + 1;
nCT = nLen;
do {
nCT --;
nTemp = nCT * nNumOfWords;
for (nCK = 0; nCK < nNumOfWords; nCK ++)
dGYHat[nTemp + nCK] = - dY[nTemp + nCK];
if (nCT < nLen - 1) dGYHat[nTemp + nWordSeq[nCD][nCT]] += 1.0;
else dGYHat[nTemp + nBorder] += 1.0;
} while (nCT > 0);
for (nCK = 0; nCK < nNumOfWords; nCK ++)
dGBiasOut[nCK] = 0.0;
for (nCN = 0; nCN < nLayerNum; nCN ++)
for (nCC = 0; nCC < nNumOfWords * nVecDim[nCN]; nCC ++)
dGWOut[nCN][nCC] = 0.0;
nCT = nLen;
do {
nCT --;
nTemp = nCT * nNumOfWords;
for (nCK = 0; nCK < nNumOfWords; nCK ++)
dGBiasOut[nCK] += dGYHat[nTemp + nCK];
nCN = nLayerNum;
do {
nCN --;
#pragma omp parallel
{
unsigned int nCK, nCNN, nCC, nC;
unsigned int nTemp, nTemp2, nTemp3;
real dTemp, dTemp2;
#ifdef _OPENMP
unsigned int nThreadID = omp_get_thread_num();
unsigned int nNumOfThreads = omp_get_num_threads();
#else
unsigned int nThreadID = 0;
unsigned int nNumOfThreads = 1;
#endif
nTemp = nCT * nNumOfWords;
for (nCK = nThreadID; nCK < nNumOfWords; nCK += nNumOfThreads)
for (nCC = 0; nCC < nVecDim[nCN]; nCC ++)
dGWOut[nCN][nVecDim[nCN] * nCK + nCC]
+= dH[nCN][nVecDim[nCN] * nCT + nCC] * dGYHat[nTemp + nCK];
for (nCC = nThreadID; nCC < nVecDim[nCN]; nCC += nNumOfThreads) {
nTemp = nVecDim[nCN] * nCT + nCC;
nTemp2 = nCT * nNumOfWords;
dGH[nCN][nTemp] = 0.0;
for (nCK = 0; nCK < nNumOfWords; nCK ++)
dGH[nCN][nTemp] += dWOut[nCN][nVecDim[nCN] * nCK + nCC]
* dGYHat[nTemp2 + nCK];
if (nCN < nLayerNum - 1) {
nCNN = nCN + 1;
nTemp2 = nCT * nVecDim[nCNN];
nTemp3 = nNumOfWords + nCC;
for (nC = 0; nC < nVecDim[nCNN]; nC ++) {
dGH[nCN][nTemp] += dWI[nCNN][nC][nTemp3] * dGAI[nCNN][nTemp2 + nC];
dGH[nCN][nTemp] += dWP[nCNN][nC][nTemp3] * dGAP[nCNN][nTemp2 + nC];
dGH[nCN][nTemp] += dWE[nCNN][nC][nTemp3] * dGAE[nCNN][nTemp2 + nC];
dGH[nCN][nTemp] += dWO[nCNN][nC][nTemp3] * dGAO[nCNN][nTemp2 + nC];
}
}
if (nCT < nLen - 1) {
nTemp2 = (nCT + 1) * nVecDim[nCN];
nTemp3 = nNumOfWords + nCC;
for (nC = 0; nC < nVecDim[nCN]; nC ++) {
if (nCN > 0) nTemp3 += nVecDim[nCN - 1];
dGH[nCN][nTemp] += dWI[nCN][nC][nTemp3] * dGAI[nCN][nTemp2 + nC];
dGH[nCN][nTemp] += dWP[nCN][nC][nTemp3] * dGAP[nCN][nTemp2 + nC];
dGH[nCN][nTemp] += dWE[nCN][nC][nTemp3] * dGAE[nCN][nTemp2 + nC];
dGH[nCN][nTemp] += dWO[nCN][nC][nTemp3] * dGAO[nCN][nTemp2 + nC];
}
}
dTemp = _tanh(dE[nCN][nTemp]);
dGAO[nCN][nTemp] = dGH[nCN][nTemp]
* dTemp * dO[nCN][nTemp] * (1.0 - dO[nCN][nTemp]);
nTemp2 = nNumOfWords + nVecDim[nCN];
if (nCN > 0) nTemp2 += nVecDim[nCN - 1];
dGE[nCN][nTemp] = dWO[nCN][nCC][nTemp2] * dGAO[nCN][nTemp];
dGE[nCN][nTemp] += dO[nCN][nTemp] * (1.0 - dTemp * dTemp) * dGH[nCN][nTemp];
if (nCT < nLen - 1) {
nTemp3 = nVecDim[nCN] * (nCT + 1) + nCC;
dGE[nCN][nTemp] += dP[nCN][nTemp3] * dGE[nCN][nTemp3];
dGE[nCN][nTemp] += dWP[nCN][nCC][nTemp2] * dGAP[nCN][nTemp3];
dGE[nCN][nTemp] += dWI[nCN][nCC][nTemp2] * dGAI[nCN][nTemp3];
}
dTemp2 = _tanh(dAE[nCN][nTemp]);
dGAE[nCN][nTemp] = dI[nCN][nTemp] * (1.0 - dTemp2 * dTemp2) * dGE[nCN][nTemp];
dGAP[nCN][nTemp] = 0.0;
if (nCT > 0)
dGAP[nCN][nTemp] += dP[nCN][nTemp] * (1.0 - dP[nCN][nTemp])
* dE[nCN][nVecDim[nCN] * (nCT - 1) + nCC] * dGE[nCN][nTemp];
dGAI[nCN][nTemp] = dI[nCN][nTemp] * (1.0 - dI[nCN][nTemp]) * dTemp * dGE[nCN][nTemp];
}
}
} while (nCN > 0);
} while (nCT > 0);
return;
}
void weight(unsigned long long nCD)
{
unsigned int nLen;
unsigned int nCN, nCNN;
nLen = nDocLen[nCD] + 1;
for (nCN = 0; nCN < nLayerNum; nCN ++) {
nCNN = 0;
if (nCN > 0) nCNN = nCN - 1;
#pragma omp parallel
{
unsigned int nCT, nCK, nCC, nC;
unsigned int nTemp, nTemp2, nTempW;
real dTempI, dTempP, dTempE, dTempO;
#ifdef _OPENMP
unsigned int nThreadID = omp_get_thread_num();
unsigned int nNumOfThreads = omp_get_num_threads();
#else
unsigned int nThreadID = 0;
unsigned int nNumOfThreads = 1;
#endif
for (nCC = nThreadID; nCC < nVecDim[nCN]; nCC += nNumOfThreads) {
for (nC = 0; nC < nWSizeI[nCN]; nC ++)
dGWI[nCN][nCC][nC] = 0.0;
for (nC = 0; nC < nWSizeP[nCN]; nC ++)
dGWP[nCN][nCC][nC] = 0.0;
for (nC = 0; nC < nWSizeE[nCN]; nC ++)
dGWE[nCN][nCC][nC] = 0.0;
for (nC = 0; nC < nWSizeO[nCN]; nC ++)
dGWO[nCN][nCC][nC] = 0.0;
for (nCT = 0; nCT < nLen; nCT ++) {
nTemp = nCT * nVecDim[nCN] + nCC;
dTempI = dGAI[nCN][nTemp];
dTempP = dGAP[nCN][nTemp];
dTempE = dGAE[nCN][nTemp];
dTempO = dGAO[nCN][nTemp];
if (nCT > 0) nCK = nWordSeq[nCD][nCT - 1];
else nCK = nBorder;
dGWI[nCN][nCC][nCK] += dTempI;
dGWP[nCN][nCC][nCK] += dTempP;
dGWE[nCN][nCC][nCK] += dTempE;
dGWO[nCN][nCC][nCK] += dTempO;
nTempW = nNumOfWords;
if (nCN > 0) {
nTemp2 = nCT * nVecDim[nCNN];
for (nC = 0; nC < nVecDim[nCNN]; nC ++) {
dGWI[nCN][nCC][nTempW + nC] += dTempI * dH[nCNN][nTemp2 + nC];
dGWP[nCN][nCC][nTempW + nC] += dTempP * dH[nCNN][nTemp2 + nC];
dGWE[nCN][nCC][nTempW + nC] += dTempE * dH[nCNN][nTemp2 + nC];
dGWO[nCN][nCC][nTempW + nC] += dTempO * dH[nCNN][nTemp2 + nC];
}
nTempW += nVecDim[nCNN];
}
if (nCT > 0) {
nTemp2 = (nCT - 1) * nVecDim[nCN];
for (nC = 0; nC < nVecDim[nCN]; nC ++) {
dGWI[nCN][nCC][nTempW + nC] += dTempI * dH[nCN][nTemp2 + nC];
dGWP[nCN][nCC][nTempW + nC] += dTempP * dH[nCN][nTemp2 + nC];
dGWE[nCN][nCC][nTempW + nC] += dTempE * dH[nCN][nTemp2 + nC];
dGWO[nCN][nCC][nTempW + nC] += dTempO * dH[nCN][nTemp2 + nC];
}
}
nTempW += nVecDim[nCN];
if (nCT > 0) {
nTemp2 = (nCT - 1) * nVecDim[nCN];
dGWI[nCN][nCC][nTempW] += dTempI * dE[nCN][nTemp2 + nCC];
dGWP[nCN][nCC][nTempW] += dTempP * dE[nCN][nTemp2 + nCC];
}
dGWO[nCN][nCC][nTempW] += dTempO * dE[nCN][nCT * nVecDim[nCN] + nCC];
dGWI[nCN][nCC][nTempW + 1] += dTempI;
dGWP[nCN][nCC][nTempW + 1] += dTempP;
dGWE[nCN][nCC][nTempW] += dTempE;
dGWO[nCN][nCC][nTempW + 1] += dTempO;
}
}
}
}
#pragma omp parallel
{
unsigned int nCN, nCK, nCC, nC;
real dTemp;
#ifdef _OPENMP
unsigned int nThreadID = omp_get_thread_num();
unsigned int nNumOfThreads = omp_get_num_threads();
#else
unsigned int nThreadID = 0;
unsigned int nNumOfThreads = 1;
#endif
for (nCN = 0; nCN < nLayerNum; nCN ++) {
for (nCC = nThreadID; nCC < nVecDim[nCN]; nCC += nNumOfThreads) {
for (nC = 0; nC < nWSizeI[nCN]; nC ++) {
dTemp = dWI[nCN][nCC][nC] + dLearningRate * dGWI[nCN][nCC][nC];
if (isfinite(dTemp)) dWI[nCN][nCC][nC] = dTemp;
}
for (nC = 0; nC < nWSizeP[nCN]; nC ++) {
dTemp = dWP[nCN][nCC][nC] + dLearningRate * dGWP[nCN][nCC][nC];
if (isfinite(dTemp)) dWP[nCN][nCC][nC] = dTemp;
}
for (nC = 0; nC < nWSizeE[nCN]; nC ++) {
dTemp = dWE[nCN][nCC][nC] + dLearningRate * dGWE[nCN][nCC][nC];
if (isfinite(dTemp)) dWE[nCN][nCC][nC] = dTemp;
}
for (nC = 0; nC < nWSizeO[nCN]; nC ++) {
dTemp = dWO[nCN][nCC][nC] + dLearningRate * dGWO[nCN][nCC][nC];
if (isfinite(dTemp)) dWO[nCN][nCC][nC] = dTemp;
}
}
for (nCC = nThreadID; nCC < nNumOfWords * nVecDim[nCN]; nCC += nNumOfThreads) {
dTemp = dWOut[nCN][nCC] + dLearningRate * dGWOut[nCN][nCC];
if (isfinite(dTemp)) dWOut[nCN][nCC] = dTemp;
}
}
for (nCK = nThreadID; nCK < nNumOfWords; nCK += nNumOfThreads) {
dTemp = dBiasOut[nCK] + dLearningRate * dGBiasOut[nCK];
if (isfinite(dTemp)) dBiasOut[nCK] = dTemp;
}
}
return;
}
//--------
int main(int argc, char **argv)
{
char sBuff[BUFFSIZE];
unsigned int nWordID;
char sTempWord[TOKENLEN], sTempDocName[TOKENLEN], sPrevDocName[TOKENLEN];
unsigned long long nCD;
unsigned int nCN, nCC;
real dLogLH;
real dTemp;
fOut = stdout;
if (initHash()) { fprintf(stderr, "%s : initHash()\n", argv[0]); exit(1); }
nNumOfDocs = 0;
nDocLen = (unsigned int *) malloc(sizeof(unsigned int));
nWordSeq = (unsigned int **) malloc(sizeof(unsigned int *));
memset(sPrevDocName, 0, TOKENLEN);
while (! feof(stdin)) {
memset(sBuff, 0, BUFFSIZE);
fgets(sBuff, BUFFSIZE - 1, stdin);
memset(sTempDocName, 0, TOKENLEN);
memset(sTempWord, 0, TOKENLEN);
if (sscanf(sBuff, "%s %s\n", sTempDocName, sTempWord) == 2) {
if (strcmp(sPrevDocName, sTempDocName) != 0) {
nNumOfDocs ++;
nDocLen = (unsigned int *) realloc(nDocLen, sizeof(unsigned int) * nNumOfDocs);
nWordSeq = (unsigned int **) realloc(nWordSeq, sizeof(unsigned int *) * nNumOfDocs);
nDocLen[nNumOfDocs - 1] = 0;
nWordSeq[nNumOfDocs - 1] = (unsigned int *) malloc(sizeof(unsigned int));
}
nWordID = addWord(sTempWord);
if (nWordID > 0) {
nDocLen[nNumOfDocs - 1] ++;
nWordSeq[nNumOfDocs - 1]
= (unsigned int *) realloc(nWordSeq[nNumOfDocs - 1],
sizeof(unsigned int) * nDocLen[nNumOfDocs - 1]);
nWordSeq[nNumOfDocs - 1][nDocLen[nNumOfDocs - 1] - 1] = nWordID - 1;
} else { fprintf(stderr, "Invalid data : %s\n", sTempWord); exit(1); }
}
memset(sPrevDocName, 0, TOKENLEN);
strcpy(sPrevDocName, sTempDocName);
}
if (nNumOfDocs == 0) { fprintf(stderr, "no valid data.\n"); exit(1); }
nBorder = addWord(BORDER);
nBorder --;
nMaxDocLen = 0;
for (nCD = 0; nCD < nNumOfDocs; nCD ++)
if (nMaxDocLen < nDocLen[nCD]) nMaxDocLen = nDocLen[nCD];
nMaxDocLen ++;
//----------------
dSgmTable = (real *) malloc(sizeof(real) * TABLE_SIZE);
dSgmTableFactor = ((real) TABLE_SIZE - 1) / (SGM_THRESHOLD * 2);
dTanhTable = (real *) malloc(sizeof(real) * TABLE_SIZE);
dTanhTableFactor = ((real) TABLE_SIZE - 1) / (TANH_THRESHOLD * 2);
for (nCC = 0; nCC < TABLE_SIZE; nCC ++) {
dTemp = exp(((real) nCC) / dSgmTableFactor - SGM_THRESHOLD);
dSgmTable[nCC] = dTemp / (1.0 + dTemp);
dTemp = exp(((real) nCC) / dTanhTableFactor - TANH_THRESHOLD);
dTanhTable[nCC] = (dTemp - 1.0 / dTemp) / (dTemp + 1.0 / dTemp);
}
//----------------
nLayerNum = LAYER_NUM;
nVecDim = (unsigned int *) malloc(sizeof(unsigned int) * nLayerNum);
dWOut = (real **) malloc(sizeof(real *) * nLayerNum);
dGWOut = (real **) malloc(sizeof(real *) * nLayerNum);
for (nCN = 0; nCN < nLayerNum; nCN ++) {
nVecDim[nCN] = VEC_DIM;
dWOut[nCN] = (real *) malloc(sizeof(real) * nNumOfWords * nVecDim[nCN]);
dGWOut[nCN] = (real *) malloc(sizeof(real) * nNumOfWords * nVecDim[nCN]);
}
dBiasOut = (real *) malloc(sizeof(real) * nNumOfWords);
dGBiasOut = (real *) malloc(sizeof(real) * nNumOfWords);
dY = (real *) malloc(sizeof(real) * nMaxDocLen * nNumOfWords);
dYHat = (real *) malloc(sizeof(real) * nMaxDocLen * nNumOfWords);
dGYHat = (real *) malloc(sizeof(real) * nMaxDocLen * nNumOfWords);
dI = (real **) malloc(sizeof(real *) * nLayerNum);
dP = (real **) malloc(sizeof(real *) * nLayerNum);
dE = (real **) malloc(sizeof(real *) * nLayerNum);
dO = (real **) malloc(sizeof(real *) * nLayerNum);
dH = (real **) malloc(sizeof(real *) * nLayerNum);
dAI = (real **) malloc(sizeof(real *) * nLayerNum);
dAP = (real **) malloc(sizeof(real *) * nLayerNum);
dAE = (real **) malloc(sizeof(real *) * nLayerNum);
dAO = (real **) malloc(sizeof(real *) * nLayerNum);
dGH = (real **) malloc(sizeof(real *) * nLayerNum);
dGE = (real **) malloc(sizeof(real *) * nLayerNum);
dGAI = (real **) malloc(sizeof(real *) * nLayerNum);
dGAP = (real **) malloc(sizeof(real *) * nLayerNum);
dGAE = (real **) malloc(sizeof(real *) * nLayerNum);
dGAO = (real **) malloc(sizeof(real *) * nLayerNum);
for (nCN = 0; nCN < nLayerNum; nCN ++) {
dI[nCN] = (real *) malloc(sizeof(real) * nVecDim[nCN] * nMaxDocLen);
dP[nCN] = (real *) malloc(sizeof(real) * nVecDim[nCN] * nMaxDocLen);
dE[nCN] = (real *) malloc(sizeof(real) * nVecDim[nCN] * nMaxDocLen);
dO[nCN] = (real *) malloc(sizeof(real) * nVecDim[nCN] * nMaxDocLen);
dH[nCN] = (real *) malloc(sizeof(real) * nVecDim[nCN] * nMaxDocLen);
dAI[nCN] = (real *) malloc(sizeof(real) * nVecDim[nCN] * nMaxDocLen);
dAP[nCN] = (real *) malloc(sizeof(real) * nVecDim[nCN] * nMaxDocLen);
dAE[nCN] = (real *) malloc(sizeof(real) * nVecDim[nCN] * nMaxDocLen);
dAO[nCN] = (real *) malloc(sizeof(real) * nVecDim[nCN] * nMaxDocLen);
dGH[nCN] = (real *) malloc(sizeof(real) * nVecDim[nCN] * nMaxDocLen);
dGE[nCN] = (real *) malloc(sizeof(real) * nVecDim[nCN] * nMaxDocLen);
dGAI[nCN] = (real *) malloc(sizeof(real) * nVecDim[nCN] * nMaxDocLen);
dGAP[nCN] = (real *) malloc(sizeof(real) * nVecDim[nCN] * nMaxDocLen);
dGAE[nCN] = (real *) malloc(sizeof(real) * nVecDim[nCN] * nMaxDocLen);
dGAO[nCN] = (real *) malloc(sizeof(real) * nVecDim[nCN] * nMaxDocLen);
}
nWSizeI = (unsigned int *) malloc(sizeof(unsigned int) * nLayerNum);
nWSizeP = (unsigned int *) malloc(sizeof(unsigned int) * nLayerNum);
nWSizeE = (unsigned int *) malloc(sizeof(unsigned int) * nLayerNum);
nWSizeO = (unsigned int *) malloc(sizeof(unsigned int) * nLayerNum);
for (nCN = 0; nCN < nLayerNum; nCN ++) {
nWSizeI[nCN] = nNumOfWords + nVecDim[nCN] + 2;
if (nCN > 0) nWSizeI[nCN] += nVecDim[nCN - 1];
nWSizeP[nCN] = nNumOfWords + nVecDim[nCN] + 2;
if (nCN > 0) nWSizeP[nCN] += nVecDim[nCN - 1];
nWSizeE[nCN] = nNumOfWords + nVecDim[nCN] + 1;
if (nCN > 0) nWSizeE[nCN] += nVecDim[nCN - 1];
nWSizeO[nCN] = nNumOfWords + nVecDim[nCN] + 2;
if (nCN > 0) nWSizeO[nCN] += nVecDim[nCN - 1];
}
dWI = (real ***) malloc(sizeof(real **) * nLayerNum);
dWP = (real ***) malloc(sizeof(real **) * nLayerNum);
dWE = (real ***) malloc(sizeof(real **) * nLayerNum);
dWO = (real ***) malloc(sizeof(real **) * nLayerNum);
dGWI = (real ***) malloc(sizeof(real **) * nLayerNum);
dGWP = (real ***) malloc(sizeof(real **) * nLayerNum);
dGWE = (real ***) malloc(sizeof(real **) * nLayerNum);
dGWO = (real ***) malloc(sizeof(real **) * nLayerNum);
for (nCN = 0; nCN < nLayerNum; nCN ++) {
dWI[nCN] = (real **) malloc(sizeof(real *) * nVecDim[nCN]);
for (nCC = 0; nCC < nVecDim[nCN]; nCC ++)
dWI[nCN][nCC] = (real *) malloc(sizeof(real) * nWSizeI[nCN]);
dWP[nCN] = (real **) malloc(sizeof(real *) * nVecDim[nCN]);
for (nCC = 0; nCC < nVecDim[nCN]; nCC ++)
dWP[nCN][nCC] = (real *) malloc(sizeof(real) * nWSizeP[nCN]);
dWE[nCN] = (real **) malloc(sizeof(real *) * nVecDim[nCN]);
for (nCC = 0; nCC < nVecDim[nCN]; nCC ++)
dWE[nCN][nCC] = (real *) malloc(sizeof(real) * nWSizeE[nCN]);
dWO[nCN] = (real **) malloc(sizeof(real *) * nVecDim[nCN]);
for (nCC = 0; nCC < nVecDim[nCN]; nCC ++)
dWO[nCN][nCC] = (real *) malloc(sizeof(real) * nWSizeO[nCN]);
dGWI[nCN] = (real **) malloc(sizeof(real *) * nVecDim[nCN]);
for (nCC = 0; nCC < nVecDim[nCN]; nCC ++)
dGWI[nCN][nCC] = (real *) malloc(sizeof(real) * nWSizeI[nCN]);
dGWP[nCN] = (real **) malloc(sizeof(real *) * nVecDim[nCN]);
for (nCC = 0; nCC < nVecDim[nCN]; nCC ++)
dGWP[nCN][nCC] = (real *) malloc(sizeof(real) * nWSizeP[nCN]);
dGWE[nCN] = (real **) malloc(sizeof(real *) * nVecDim[nCN]);
for (nCC = 0; nCC < nVecDim[nCN]; nCC ++)
dGWE[nCN][nCC] = (real *) malloc(sizeof(real) * nWSizeE[nCN]);
dGWO[nCN] = (real **) malloc(sizeof(real *) * nVecDim[nCN]);
for (nCC = 0; nCC < nVecDim[nCN]; nCC ++)
dGWO[nCN][nCC] = (real *) malloc(sizeof(real) * nWSizeO[nCN]);
}
// print statistics
fprintf(fOut, "# %llu docs", nNumOfDocs);
fprintf(fOut, ", %u words", nNumOfWords);
for (nCN = 0; nCN < nLayerNum; nCN ++)
fprintf(fOut, ", %u units (%u)", nVecDim[nCN], nCN);
fprintf(fOut, "\n");
fflush(fOut);
initParam();
nIteration = 1;
dLearningRate = LEARNING_RATE;
time(&start_time);
while (1) {
nNumOfSamples = 0;
dLogLH = 0.0;
dTotalLength = 0.0;
for (nCD = 0; nCD < nNumOfDocs; nCD ++) {
dLogLH += forward(nCD);
backward(nCD);
weight(nCD);
nNumOfSamples ++;
dTotalLength += (real) (nDocLen[nCD] + 1);
if (nNumOfSamples % 100 == 0) {
fprintf(fOut, "## %d %llu ", nIteration, nNumOfSamples);
fprintf(fOut, "%.0f ", dTotalLength);
fprintf(fOut, "%.3f ", exp(- dLogLH / dTotalLength));
put_time(fOut);
fprintf(fOut, "\n");
fflush(fOut);
}
}
nIteration ++;
printParam();
}
exit(0);
}
@tomonari-masada
Copy link
Author

How to complie:
gcc -Wall -O3 -msse2 -DHAVE_SSE2 -o lstm_omp lstm_omp.c -lm -fopenmp -lgomp

@nobuyuki68
Copy link

素晴らしいですね。ボクもやっと多層パーセプトロンを実装したところです。エンジニアでなくても子供でもできるAIの教育ツールを作っています。そのためにGUIを強化し、マウス1つで操作できるように設計しています。画像の教師が終わったので、RNNやLSTMに移行し、時系列データーや文章データーを扱いたいと考えています。いろいろと参考にさせてください。ありがとうございます。よろしくお願いいたします。

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment