Annotated Word2Vec
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <time.h> // For clock_t, clock() and CLOCKS_PER_SEC used below
#include <pthread.h>
// Maximum Word Length
#define MAX_STRING 100
// Exponential Table Size, Provides 1000 Precomputed Values
#define EXP_TABLE_SIZE 1000
// From -6 (exp^-6 / (exp^-6 + 1)) to 6 (exp^6 / (exp^6 + 1))
#define MAX_EXP 6
// Maximum Sentence Length
#define MAX_SENTENCE_LENGTH 1000
// Maximum Huffman Code Length
#define MAX_CODE_LENGTH 40
// Hash, Linear Probing, Open Addressing, Fill Factor = 0.7
const int vocab_hash_size = 30000000; // Maximum 30M * 0.7 = 21M Words in the Vocabulary
typedef float real; // Precision of float numbers
struct vocab_word {
long long cn; // Word Frequency
int *point; // Path From the Root to the Word in the Huffman Tree, Storing the Index of Every Non-Leaf Node
char *word, *code, codelen; // Word, Huffman Code, Code Length
};
// Definition of Training File, Output File
char train_file[MAX_STRING], output_file[MAX_STRING];
// Definition of Output Vocabulary File, Input Vocabulary File
char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
// Definition of Vocabulary Struct
struct vocab_word *vocab;
// binary 0 -> Output Vectors in Text Format; binary 1 -> Output vectors.bin in Binary Format
// cbow 1 -> Use CBOW; cbow 0 -> Use Skip-gram
// debug_mode > 0 -> Print Information When Loading Completes; debug_mode > 1 -> Print Information During Training
// window Window Size: Maximum Scope of the Context Sum in CBOW; Maximum Distance Between Words (w1, w2, p(w1 | w2)) in Skip-gram
// min_count Minimum Word Frequency; Rarer Words Are Discarded
// num_threads Number of Training Threads
// min_reduce Minimum Word Frequency Used in ReduceVocab, Needed Because the Hash Table Size Is Limited
int binary = 0, cbow = 0, debug_mode = 2, window = 5, min_count = 5, num_threads = 1, min_reduce = 1;
int *vocab_hash; // Vocabulary Hash; Index: Hash Value; Content: Position of the Word in the Vocabulary (a[word_hash] = word index in vocab)
// vocab_max_size Maximum Vocabulary Size
// vocab_size Current Vocabulary Size, Extended by 1000 Each Time It Approaches vocab_max_size
// layer1_size Number of Nodes in the Hidden Layer (Word Vector Dimensionality)
long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;
// train_words Number of Training Words (Accumulated Word Frequency)
// word_count_actual Number of Words Trained So Far
// file_size Training File Size, Obtained Through 'ftell'
// classes Number of Classes for the K-means Output
long long train_words = 0, word_count_actual = 0, file_size = 0, classes = 0;
// alpha Learning Rate, Adjusted During Training
// starting_alpha Initial Learning Rate
// sample Subsampling Parameter. Subsampling Randomly Discards High-Frequency Words; Default = 0 -> No Subsampling
real alpha = 0.025, starting_alpha, sample = 0;
// syn0 Concatenated Word Vectors
// syn1 Weights From the Hidden Layer to the Non-Leaf Nodes of the Huffman Tree (Hierarchical Softmax)
// syn1neg Weights From the Hidden Layer to the Sampled Targets (Negative Sampling)
// expTable Exponential (Sigmoid) Caching Table
real *syn0, *syn1, *syn1neg, *expTable;
// start Starting Time, Used to Compute How Many Words Are Trained Per Second
clock_t start;
// hs Hierarchical Softmax Flag; negative Number of Negative Samples (0 = Negative Sampling Off)
int hs = 1, negative = 0;
// table_size Sampling Table Size
// table Sampling Table
const int table_size = 1e8;
int *table;
// Generate the Sampling Table According to Word Frequency
void InitUnigramTable() { | |
int a, i; | |
long long train_words_pow = 0; | |
real d1, power = 0.75; //Probability is Proportional to Word Freq to the power of 'power' | |
table = (int *)malloc(table_size * sizeof(int)); | |
for (a = 0; a < vocab_size; a++) train_words_pow += pow(vocab[a].cn, power); | |
i = 0; | |
d1 = pow(vocab[i].cn, power) / (real)train_words_pow;//Probability of First Word | |
for (a = 0; a < table_size; a++) { | |
table[a] = i; | |
if (a / (real)table_size > d1) { | |
i++; | |
d1 += pow(vocab[i].cn, power) / (real)train_words_pow; | |
} | |
if (i >= vocab_size) i = vocab_size - 1;//Process The Last One to Avoid Crossing Boundary | |
} | |
} | |
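// Note: the table above implements the unigram distribution raised to the 0.75 power.
// Each word occupies a run of slots proportional to pow(cn, 0.75) / train_words_pow,
// so drawing a uniformly random index from table[] samples words with probability ~ cn^0.75.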
// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
void ReadWord(char *word, FILE *fin) {
int a = 0, ch;
while (!feof(fin)) {
ch = fgetc(fin);
if (ch == 13) continue; // Skip Carriage Return (ASCII 13)
if ((ch == ' ') || (ch == '\t') || (ch == '\n')) { // Separators Between Words
if (a > 0) { // A Word Has Been Read; Stop Here
if (ch == '\n') ungetc(ch, fin); // Push the Newline Back to the Input Stream
break;
}
if (ch == '\n') { // Newline With No Word Read Yet: Return the Sentence Marker
strcpy(word, (char *)"</s>");
return;
} else continue;
}
word[a] = ch;
a++;
if (a >= MAX_STRING - 1) a--; // Truncate too long words
}
word[a] = 0;
}
// Returns hash value of a word
int GetWordHash(char *word) {
unsigned long long a, hash = 0;
for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a]; // Polynomial Rolling Hash With Base 257
hash = hash % vocab_hash_size;
return hash;
}
// Returns position of a word in the vocabulary; if the word is not found, returns -1
// Linear Probing, Open Addressing
int SearchVocab(char *word) {
unsigned int hash = GetWordHash(word);
while (1) {
if (vocab_hash[hash] == -1) return -1; // Empty Slot: Word Not in the Vocabulary
if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash]; // Return Index in the Vocabulary Table
hash = (hash + 1) % vocab_hash_size;
}
return -1;
}
// Reads a Word and Returns Its Index in the Vocabulary
int ReadWordIndex(FILE *fin) { | |
char word[MAX_STRING]; | |
ReadWord(word, fin); | |
if (feof(fin)) return -1; | |
return SearchVocab(word); | |
} | |
// Adds a Word to the Vocabulary
int AddWordToVocab(char *word) {
unsigned int hash, length = strlen(word) + 1;
if (length > MAX_STRING) length = MAX_STRING;
vocab[vocab_size].word = (char *)calloc(length, sizeof(char)); // Store the Word
strcpy(vocab[vocab_size].word, word);
vocab[vocab_size].cn = 0; // Default Frequency = 0
vocab_size++; // Number of Words in the Vocabulary + 1
// Reallocate Memory if Needed
if (vocab_size + 2 >= vocab_max_size) {
vocab_max_size += 1000; // Increase by 1000 Each Time
vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word));
}
hash = GetWordHash(word); // Get Hash Value
while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; // Linear Probing
vocab_hash[hash] = vocab_size - 1; // Record Position in the Vocabulary Table
return vocab_size - 1; // Return Its Position in the Vocabulary Table
}
// Used Later for Sorting by Word Counts
int VocabCompare(const void *a, const void *b) {
return ((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn;
}
// Sorts the vocabulary by frequency using word counts | |
void SortVocab() { | |
int a, size; | |
unsigned int hash; | |
// Sort the vocabulary and keep </s> at the first position
qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare); // Quick Sort on Vocabulary Table | |
for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; // Quick Sort Messes Up Hash, Rebuild Later | |
size = vocab_size; | |
train_words = 0; // Training Words Number(Accumulation of Word Frequency) | |
for (a = 0; a < size; a++) { | |
// Words Occurring Less Than min_count Times Will Be Discarded From the Vocabulary
if (vocab[a].cn < min_count) { | |
vocab_size--; | |
free(vocab[vocab_size].word); | |
} else { | |
// Hash Must Be Recomputed, Since Sorting Invalidated the Old Positions
hash=GetWordHash(vocab[a].word); | |
while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; | |
vocab_hash[hash] = a; | |
train_words += vocab[a].cn; // Accumulation of Word Frequency | |
} | |
} | |
vocab = (struct vocab_word *)realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word)); // Collect Spare Memory | |
// Allocate Memory For the Binary Tree Construction | |
for (a = 0; a < vocab_size; a++) { | |
vocab[a].code = (char *)calloc(MAX_CODE_LENGTH, sizeof(char)); | |
vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int)); | |
} | |
} | |
// Reduces the Vocabulary by Removing Infrequent Tokens | |
void ReduceVocab() { | |
int a, b = 0; | |
unsigned int hash; | |
for (a = 0; a < vocab_size; a++) if (vocab[a].cn > min_reduce) { | |
vocab[b].cn = vocab[a].cn; | |
vocab[b].word = vocab[a].word; | |
b++; | |
} else free(vocab[a].word); | |
vocab_size = b; // b Words Remain, All With Frequency > min_reduce
for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; | |
for (a = 0; a < vocab_size; a++) { | |
// Hash Must Be Recomputed, Since the Positions Have Changed
hash = GetWordHash(vocab[a].word); | |
while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; | |
vocab_hash[hash] = a; | |
} | |
fflush(stdout); | |
min_reduce++; | |
} | |
// Create a Binary Huffman Tree Using the Word Counts
// Frequent Words Will Have Short, Unique Binary Codes
void CreateBinaryTree() { | |
long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH]; | |
char code[MAX_CODE_LENGTH]; | |
long long *count = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long)); | |
long long *binary = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long)); | |
long long *parent_node = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long)); | |
for (a = 0; a < vocab_size; a++) count[a] = vocab[a].cn; | |
for (a = vocab_size; a < vocab_size * 2; a++) count[a] = 1e15; | |
pos1 = vocab_size - 1; | |
pos2 = vocab_size; | |
// Following Algorithm Constructs The Huffman tree By Adding One Node At A Time | |
for (a = 0; a < vocab_size - 1; a++) { | |
// First, Find Two Smallest Nodes 'min1, min2' | |
if (pos1 >= 0) { | |
if (count[pos1] < count[pos2]) { | |
min1i = pos1; | |
pos1--; | |
} else { | |
min1i = pos2; | |
pos2++; | |
} | |
} else { | |
min1i = pos2; | |
pos2++; | |
} | |
if (pos1 >= 0) { | |
if (count[pos1] < count[pos2]) { | |
min2i = pos1; | |
pos1--; | |
} else { | |
min2i = pos2; | |
pos2++; | |
} | |
} else { | |
min2i = pos2; | |
pos2++; | |
} | |
count[vocab_size + a] = count[min1i] + count[min2i]; | |
parent_node[min1i] = vocab_size + a; | |
parent_node[min2i] = vocab_size + a; | |
binary[min2i] = 1; | |
} | |
// Now Assign Binary Code to Each Vocabulary Word | |
for (a = 0; a < vocab_size; a++) { | |
b = a; | |
i = 0; | |
while (1) { | |
code[i] = binary[b]; // Assign Code | |
point[i] = b; // Assign Code Through Path, Itself First | |
i++; | |
b = parent_node[b]; | |
if (b == vocab_size * 2 - 2) break; | |
} | |
// Notice That point Is One Layer Deeper Than code At the Same Position
vocab[a].codelen = i; // Code Length, Not Counting the Root
vocab[a].point[0] = vocab_size - 2; // Reverse Order, Assign the Root First (Root Index 2 * vocab_size - 2, Offset by vocab_size)
for (b = 0; b < i; b++) { // Reverse Order
vocab[a].code[i - b - 1] = code[b]; // Root Excluded; Left Child = 0, Right Child = 1
vocab[a].point[i - b] = point[b] - vocab_size; // Notice the Last Element Written to point Is Negative and Meaningless (the Leaf Itself)
} | |
} | |
free(count); | |
free(binary); | |
free(parent_node); | |
} | |
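// Resulting layout (illustration): for a word whose Huffman code has length L,
// point[0..L-1] hold the inner-node indices along the path from the root toward the leaf
// (already offset by vocab_size so they index directly into syn1), and code[d] is the
// branch (0 = left, 1 = right) taken below node point[d]. The entry point[L] refers to
// the leaf itself (stored as a negative offset) and is never used during training.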
// Loading Training File To Vocabulary Table | |
void LearnVocabFromTrainFile() { | |
char word[MAX_STRING]; | |
FILE *fin; | |
long long a, i; | |
for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; | |
fin = fopen(train_file, "rb"); | |
if (fin == NULL) { | |
printf("ERROR: training data file not found!\n"); | |
exit(1); | |
} | |
vocab_size = 0; | |
AddWordToVocab((char *)"</s>"); // Add the Sentence Marker </s> First
while (1) { | |
ReadWord(word, fin); | |
if (feof(fin)) break; | |
train_words++; | |
if ((debug_mode > 1) && (train_words % 100000 == 0)) { | |
printf("%lldK%c", train_words / 1000, 13); | |
fflush(stdout); | |
} | |
i = SearchVocab(word); | |
if (i == -1) { // If Not Exist, Add It to Hash Table | |
a = AddWordToVocab(word); | |
vocab[a].cn = 1; | |
} else vocab[i].cn++; // Otherwise, Word Frequency + 1 | |
if (vocab_size > vocab_hash_size * 0.7) ReduceVocab(); // Shrink the Vocabulary If the Hash Table Gets Too Full
} | |
SortVocab(); // Sort All Words by Frequency
if (debug_mode > 0) { | |
printf("Vocab size: %lld\n", vocab_size); | |
printf("Words in train file: %lld\n", train_words); | |
} | |
file_size = ftell(fin); // File Size | |
fclose(fin); | |
} | |
// Output Word and Frequency to File | |
void SaveVocab() { | |
long long i; | |
FILE *fo = fopen(save_vocab_file, "wb"); | |
for (i = 0; i < vocab_size; i++) fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn); | |
fclose(fo); | |
} | |
// Loading Vocabulary File To Vocabulary Table | |
void ReadVocab() { | |
long long a, i = 0; | |
char c; | |
char word[MAX_STRING]; | |
FILE *fin = fopen(read_vocab_file, "rb"); | |
if (fin == NULL) { | |
printf("Vocabulary file not found\n"); | |
exit(1); | |
} | |
for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; | |
vocab_size = 0; | |
while (1) { | |
ReadWord(word, fin); | |
if (feof(fin)) break; | |
a = AddWordToVocab(word); | |
fscanf(fin, "%lld%c", &vocab[a].cn, &c); | |
i++; | |
} | |
SortVocab(); | |
if (debug_mode > 0) { | |
printf("Vocab size: %lld\n", vocab_size); | |
printf("Words in train file: %lld\n", train_words); | |
} | |
fin = fopen(train_file, "rb"); // Open File to Get File Size | |
if (fin == NULL) { | |
printf("ERROR: training data file not found!\n"); | |
exit(1); | |
} | |
fseek(fin, 0, SEEK_END); | |
file_size = ftell(fin); | |
fclose(fin); | |
} | |
// Net Initialization
void InitNet() {
long long a, b;
// posix_memalign() Allocates the Requested Number of Bytes Aligned to a 128-Byte Boundary and Returns 0 on Success
// syn0 Word Vectors
a = posix_memalign((void **)&syn0, 128, (long long)vocab_size * layer1_size * sizeof(real));
if (syn0 == NULL) {printf("Memory allocation failed\n"); exit(1);} | |
if (hs) { // Hierarchical Softmax | |
// Using syn1 in Hierarchical Softmax | |
a = posix_memalign((void **)&syn1, 128, (long long)vocab_size * layer1_size * sizeof(real)); | |
if (syn1 == NULL) {printf("Memory allocation failed\n"); exit(1);} | |
for (b = 0; b < layer1_size; b++) for (a = 0; a < vocab_size; a++) | |
syn1[a * layer1_size + b] = 0; | |
} | |
if (negative>0) { // Negative Sampling | |
// Using syn1neg in Negative Sampling | |
a = posix_memalign((void **)&syn1neg, 128, (long long)vocab_size * layer1_size * sizeof(real)); | |
if (syn1neg == NULL) {printf("Memory allocation failed\n"); exit(1);} | |
for (b = 0; b < layer1_size; b++) for (a = 0; a < vocab_size; a++) | |
syn1neg[a * layer1_size + b] = 0; | |
} | |
for (b = 0; b < layer1_size; b++) for (a = 0; a < vocab_size; a++) | |
syn0[a * layer1_size + b] = (rand() / (real)RAND_MAX - 0.5) / layer1_size; // Random Initialization of syn0
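// Each weight above is drawn uniformly from roughly [-0.5 / layer1_size, 0.5 / layer1_size],
// so the initial vectors are small and centered around zero.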
CreateBinaryTree(); // Creating Huffman Tree | |
} | |
void *TrainModelThread(void *id) {
// word Word Being Added to sen; Later, the Current (Center) Word While Processing a Sentence
// last_word Context Word Used During the Window Scan
// sentence_length Current Sentence Length
// sentence_position Index of the Current Word in the Sentence
long long a, b, d, word, last_word, sentence_length = 0, sentence_position = 0;
// word_count Number of Words Trained by This Thread
// sen Sentence Array
long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
// l1 Offset of the Context Word's Vector in syn0 (Skip-gram)
// l2 Offset of the Target Node / Word in syn1 or syn1neg
// c Loop Counter
// target Current Sample in Negative Sampling
// label Current Sample's Label in Negative Sampling
long long l1, l2, c, target, label;
// id Thread Id Passed at Creation, Used to Seed the Random Number Generator
unsigned long long next_random = (long long)id;
// f e^x / (e^x + 1), Probability of code = 0 in HS; Probability of label = 1 in NS
// g Error (label - f) Multiplied by the Learning Rate
real f, g;
// Current Time
clock_t now;
real *neu1 = (real *)calloc(layer1_size, sizeof(real)); // Hidden Node | |
real *neu1e = (real *)calloc(layer1_size, sizeof(real)); // Error Accumulation | |
FILE *fi = fopen(train_file, "rb"); | |
fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET); // Distribute File to Each Thread | |
while (1) { | |
if (word_count - last_word_count > 10000) { | |
word_count_actual += word_count - last_word_count; | |
last_word_count = word_count; | |
if ((debug_mode > 1)) { | |
now=clock(); | |
printf("%cAlpha: %f Progress: %.2f%% Words/thread/sec: %.2fk ", 13, alpha, | |
word_count_actual / (real)(train_words + 1) * 100, | |
word_count_actual / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000)); | |
fflush(stdout); | |
} | |
alpha = starting_alpha * (1 - word_count_actual / (real)(train_words + 1)); // Adjusting Learning Rate | |
if (alpha < starting_alpha * 0.0001) alpha = starting_alpha * 0.0001; // Lower Bound of Learning Rate | |
} | |
if (sentence_length == 0) { // If Current Sentence Length = 0 | |
while (1) { | |
word = ReadWordIndex(fi); | |
if (feof(fi)) break; // Till End of File | |
if (word == -1) continue; // Word Not Exist | |
word_count++; | |
if (word == 0) break; // Word Is </s> (End of Sentence)
// The subsampling randomly discards frequent words while keeping the ranking same | |
// Discard High Frequency Words with High Probability And Vice Versa | |
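// Keep probability used below: p(w) = (sqrt(cn / (sample * train_words)) + 1) * (sample * train_words) / cn,
// which falls below 1 only for words whose relative frequency is well above the 'sample' threshold.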
if (sample > 0) { | |
real ran = (sqrt(vocab[word].cn / (sample * train_words)) + 1) * (sample * train_words) / vocab[word].cn; | |
next_random = next_random * (unsigned long long)25214903917 + 11; | |
if (ran < (next_random & 0xFFFF) / (real)65536) continue; | |
} | |
sen[sentence_length] = word; | |
sentence_length++; | |
if (sentence_length >= MAX_SENTENCE_LENGTH) break; | |
} | |
sentence_position = 0; | |
} | |
if (feof(fi)) break; // If Arrive End of File, Then Exit | |
if (word_count > train_words / num_threads) break; // If Complete Assigned Work, Then Exit | |
word = sen[sentence_position]; // Pick the Word at the Current Position in the Sentence
if (word == -1) continue; // If Word Not Exist, Then Continue | |
// Reset Error Accumulation | |
for (c = 0; c < layer1_size; c++) neu1[c] = 0; | |
for (c = 0; c < layer1_size; c++) neu1e[c] = 0; | |
next_random = next_random * (unsigned long long)25214903917 + 11; | |
b = next_random % window; // b Is a Random Number in [0, window - 1]; the Effective Window Size for This Word Is window - b
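// Because b is resampled for every center word, the effective window size varies uniformly
// between 1 and window, so context words closer to the center are used more often.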
// CBOW | |
if (cbow) { //Train the Cbow Architecture | |
// in -> hidden | |
// Accumulate the Word Vectors in the Current Window Into the Hidden Node
for (a = b; a < window * 2 + 1 - b; a++) if (a != window) { | |
c = sentence_position - window + a; | |
if (c < 0) continue; | |
if (c >= sentence_length) continue; | |
last_word = sen[c]; | |
if (last_word == -1) continue; // Word Not Exist | |
for (c = 0; c < layer1_size; c++) neu1[c] += syn0[c + last_word * layer1_size]; | |
} | |
// HS | |
if (hs) for (d = 0; d < vocab[word].codelen; d++) { | |
f = 0; | |
l2 = vocab[word].point[d] * layer1_size; // Offset of the Current Inner Node on the Path (Into syn1)
// Propagate hidden -> output | |
// Prepare For Computing f | |
for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2]; | |
// Clamp f When It Falls Outside the Table Range (the Commented-Out Lines Below Discarded These Cases Instead)
// if (f <= -MAX_EXP) continue; | |
// else if (f >= MAX_EXP) continue; | |
if (f <= -MAX_EXP) f = 0; | |
else if (f >= MAX_EXP) f = 1; | |
// Look Up the Cached Value in the Exponential Table
else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]; | |
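// At this point f = sigmoid(neu1 . syn1[l2]); the target for this node is (1 - code[d]),
// so the update below is the usual logistic-regression gradient (target - prediction) * alpha.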
// 'g' is the Gradient Multiplied by the Learning Rate | |
g = (1 - vocab[word].code[d] - f) * alpha; | |
// Propagate Errors Output -> Hidden | |
for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2]; | |
// Learn weights hidden -> output | |
// Update Weight Which From Hidden Layer to Non-Leaf Node in Huffman tree | |
for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * neu1[c]; | |
} | |
// NEGATIVE SAMPLING | |
if (negative > 0) for (d = 0; d < negative + 1; d++) { | |
if (d == 0) { // Positive Sample: the Target Word Itself, Label = 1
target = word; | |
label = 1; | |
} else { | |
next_random = next_random * (unsigned long long)25214903917 + 11; | |
target = table[(next_random >> 16) % table_size]; | |
if (target == 0) target = next_random % (vocab_size - 1) + 1; | |
if (target == word) continue; | |
label = 0; | |
} | |
l2 = target * layer1_size; | |
f = 0; | |
for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1neg[c + l2]; | |
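// f is the dot product with the sampled target; below, g = (label - sigmoid(f)) * alpha,
// with the sigmoid clamped to 1 (f > MAX_EXP) or 0 (f < -MAX_EXP) outside the table range.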
if (f > MAX_EXP) g = (label - 1) * alpha; | |
else if (f < -MAX_EXP) g = (label - 0) * alpha; | |
else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha; | |
for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2]; | |
for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * neu1[c]; | |
} | |
// hidden -> in
// Update the Word Vectors in the Window With the Accumulated Error From the Hidden Layer
for (a = b; a < window * 2 + 1 - b; a++) if (a != window) { | |
c = sentence_position - window + a; | |
if (c < 0) continue; | |
if (c >= sentence_length) continue; | |
last_word = sen[c]; | |
if (last_word == -1) continue; | |
for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c]; | |
} | |
} else { //train skip-gram | |
for (a = b; a < window * 2 + 1 - b; a++) if (a != window) { // Iterate Over Context Words in the Window
c = sentence_position - window + a; | |
if (c < 0) continue; | |
if (c >= sentence_length) continue; | |
last_word = sen[c]; | |
if (last_word == -1) continue; | |
l1 = last_word * layer1_size; | |
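// In skip-gram there is no averaging step: the context word's own vector syn0[l1 .. l1 + layer1_size)
// acts as the hidden layer, and each (center word, context word) pair is trained independently.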
// Reset Error Accumulation = 0 | |
for (c = 0; c < layer1_size; c++) neu1e[c] = 0; | |
// HIERARCHICAL SOFTMAX | |
if (hs) for (d = 0; d < vocab[word].codelen; d++) { | |
f = 0; | |
l2 = vocab[word].point[d] * layer1_size; | |
// Propagate hidden -> output | |
// Dot Product of the Context Word's Vector and the Non-Leaf Node in the Huffman Tree
for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1[c + l2]; | |
// if (f <= -MAX_EXP) continue; | |
// else if (f >= MAX_EXP) continue; | |
if (f <= -MAX_EXP) f = 0; | |
else if (f >= MAX_EXP) f = 1; | |
else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]; | |
// 'g' is the gradient multiplied by the learning rate | |
g = (1 - vocab[word].code[d] - f) * alpha; | |
// Propagate errors output -> hidden | |
for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2]; | |
// Learn weights hidden -> output | |
for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * syn0[c + l1]; | |
} | |
// NEGATIVE SAMPLING | |
if (negative > 0) for (d = 0; d < negative + 1; d++) { | |
if (d == 0) { | |
target = word; | |
label = 1; | |
} else { | |
next_random = next_random * (unsigned long long)25214903917 + 11; | |
target = table[(next_random >> 16) % table_size]; | |
if (target == 0) target = next_random % (vocab_size - 1) + 1; | |
if (target == word) continue; | |
label = 0; | |
} | |
l2 = target * layer1_size; | |
f = 0; | |
for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1neg[c + l2]; | |
// Same As CBOW | |
if (f > MAX_EXP) g = (label - 1) * alpha; | |
else if (f < -MAX_EXP) g = (label - 0) * alpha; | |
else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha; | |
for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2]; | |
for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * syn0[c + l1]; | |
} | |
// Learn weights input -> hidden | |
for (c = 0; c < layer1_size; c++) syn0[c + l1] += neu1e[c]; | |
} | |
} | |
sentence_position++; | |
if (sentence_position >= sentence_length) { | |
sentence_length = 0; | |
continue; | |
} | |
} | |
fclose(fi); | |
free(neu1); | |
free(neu1e); | |
pthread_exit(NULL); | |
} | |
void TrainModel() { | |
long a, b, c, d; | |
FILE *fo; | |
// Create Multiple Threads
pthread_t *pt = (pthread_t *)malloc(num_threads * sizeof(pthread_t)); | |
printf("Starting training using file %s\n", train_file); | |
starting_alpha = alpha; | |
// Load From the Vocabulary File If Given, Otherwise Build the Vocabulary From the Training File
if (read_vocab_file[0] != 0) ReadVocab(); else LearnVocabFromTrainFile(); | |
// Output the Vocabulary (Word and Frequency) to a File
if (save_vocab_file[0] != 0) SaveVocab(); | |
if (output_file[0] == 0) return; | |
InitNet(); // Network Initialization | |
if (negative > 0) InitUnigramTable(); // Build the Sampling Table According to Word Frequency
start = clock(); // Start the Clock
for (a = 0; a < num_threads; a++) pthread_create(&pt[a], NULL, TrainModelThread, (void *)a); | |
for (a = 0; a < num_threads; a++) pthread_join(pt[a], NULL); | |
// Training Is Done; Save the Results
fo = fopen(output_file, "wb"); | |
if (classes == 0) {
// Save the word vectors
fprintf(fo, "%lld %lld\n", vocab_size, layer1_size); // Header: Vocabulary Size and Vector Dimensionality
for (a = 0; a < vocab_size; a++) { | |
fprintf(fo, "%s ", vocab[a].word); | |
if (binary) for (b = 0; b < layer1_size; b++) fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo); | |
else for (b = 0; b < layer1_size; b++) fprintf(fo, "%lf ", syn0[a * layer1_size + b]); | |
fprintf(fo, "\n"); | |
} | |
} else { | |
// Run K-means On The Word Vectors | |
int clcn = classes, iter = 10, closeid; | |
int *centcn = (int *)malloc(classes * sizeof(int)); | |
int *cl = (int *)calloc(vocab_size, sizeof(int)); | |
real closev, x; | |
real *cent = (real *)calloc(classes * layer1_size, sizeof(real)); | |
for (a = 0; a < vocab_size; a++) cl[a] = a % clcn; | |
for (a = 0; a < iter; a++) { | |
for (b = 0; b < clcn * layer1_size; b++) cent[b] = 0; | |
for (b = 0; b < clcn; b++) centcn[b] = 1; | |
for (c = 0; c < vocab_size; c++) { | |
for (d = 0; d < layer1_size; d++) cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d]; | |
centcn[cl[c]]++; | |
} | |
for (b = 0; b < clcn; b++) { | |
closev = 0; | |
for (c = 0; c < layer1_size; c++) { | |
cent[layer1_size * b + c] /= centcn[b]; | |
closev += cent[layer1_size * b + c] * cent[layer1_size * b + c]; | |
} | |
closev = sqrt(closev); | |
for (c = 0; c < layer1_size; c++) cent[layer1_size * b + c] /= closev; | |
} | |
for (c = 0; c < vocab_size; c++) { | |
closev = -10; | |
closeid = 0; | |
for (d = 0; d < clcn; d++) { | |
x = 0; | |
for (b = 0; b < layer1_size; b++) x += cent[layer1_size * d + b] * syn0[c * layer1_size + b]; | |
if (x > closev) { | |
closev = x; | |
closeid = d; | |
} | |
} | |
cl[c] = closeid; | |
} | |
} | |
// Save the K-means classes | |
for (a = 0; a < vocab_size; a++) fprintf(fo, "%s %d\n", vocab[a].word, cl[a]); | |
free(centcn); | |
free(cent); | |
free(cl); | |
} | |
fclose(fo); | |
} | |
int ArgPos(char *str, int argc, char **argv) { | |
int a; | |
for (a = 1; a < argc; a++) if (!strcmp(str, argv[a])) { | |
if (a == argc - 1) { | |
printf("Argument missing for %s\n", str); | |
exit(1); | |
} | |
return a; | |
} | |
return -1; | |
} | |
int main(int argc, char **argv) { | |
int i; | |
if (argc == 1) { | |
printf("WORD VECTOR estimation toolkit v 0.1b\n\n"); | |
printf("Options:\n"); | |
printf("Parameters for training:\n"); | |
printf("\t-train <file>\n"); // Specify Trainging File | |
printf("\t\tUse text data from <file> to train the model\n"); | |
printf("\t-output <file>\n"); // Specify Vocabulary File | |
printf("\t\tUse <file> to save the resulting word vectors / word clusters\n"); | |
printf("\t-size <int>\n"); // Dim of word vector,Corresponds to layer1_size, Defualt Value = 100 | |
printf("\t\tSet size of word vectors; default is 100\n"); // Context Window Size | |
printf("\t-window <int>\n"); | |
printf("\t\tSet max skip length between words; default is 5\n"); | |
printf("\t-sample <float>\n"); // Decline Probability in Sub-sampling | |
printf("\t\tSet threshold for occurrence of words. Those that appear with higher frequency"); | |
printf(" in the training data will be randomly down-sampled; default is 0 (off), useful value is 1e-5\n"); | |
printf("\t-hs <int>\n"); // Using HS, Default Value = 1 | |
printf("\t\tUse Hierarchical Softmax; default is 1 (0 = not used)\n"); | |
printf("\t-negative <int>\n"); // Sample Numbers in NS | |
printf("\t\tNumber of negative examples; default is 0, common values are 5 - 10 (0 = not used)\n"); | |
printf("\t-threads <int>\n"); // Thread Number | |
printf("\t\tUse <int> threads (default 1)\n"); | |
printf("\t-min-count <int>\n"); // Threshold of Minimum Word Number | |
printf("\t\tThis will discard words that appear less than <int> times; default is 5\n"); | |
printf("\t-alpha <float>\n"); // Starting Learning Rate, Default Value = 0.025 | |
printf("\t\tSet the starting learning rate; default is 0.025\n"); | |
printf("\t-classes <int>\n"); // Number of Output Word Class, Default Value = 0 -> No Output | |
printf("\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n"); | |
printf("\t-debug <int>\n"); // Debug Parameter | |
printf("\t\tSet the debug mode (default = 2 = more info during training)\n"); | |
printf("\t-binary <int>\n"); // binary 0 -> vectors.bin Output Binary Stream; binary 1 -> Output Text Format | |
printf("\t\tSave the resulting vectors in binary moded; default is 0 (off)\n"); | |
printf("\t-save-vocab <file>\n"); // Vocabulary File | |
printf("\t\tThe vocabulary will be saved to <file>\n"); | |
printf("\t-read-vocab <file>\n"); // Vocabulary Loading File | |
printf("\t\tThe vocabulary will be read from <file>, not constructed from the training data\n"); | |
printf("\t-cbow <int>\n"); // Using CBOW | |
printf("\t\tUse the continuous bag of words model; default is 0 (skip-gram model)\n"); | |
printf("\nExamples:\n"); // Examples | |
printf("./word2vec -train data.txt -output vec.txt -debug 2 -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -cbow 1\n\n"); | |
return 0; | |
} | |
// Initialize File Names to Empty Strings
output_file[0] = 0; | |
save_vocab_file[0] = 0; | |
read_vocab_file[0] = 0; | |
if ((i = ArgPos((char *)"-size", argc, argv)) > 0) layer1_size = atoi(argv[i + 1]); | |
if ((i = ArgPos((char *)"-train", argc, argv)) > 0) strcpy(train_file, argv[i + 1]); | |
if ((i = ArgPos((char *)"-save-vocab", argc, argv)) > 0) strcpy(save_vocab_file, argv[i + 1]); | |
if ((i = ArgPos((char *)"-read-vocab", argc, argv)) > 0) strcpy(read_vocab_file, argv[i + 1]); | |
if ((i = ArgPos((char *)"-debug", argc, argv)) > 0) debug_mode = atoi(argv[i + 1]); | |
if ((i = ArgPos((char *)"-binary", argc, argv)) > 0) binary = atoi(argv[i + 1]); | |
if ((i = ArgPos((char *)"-cbow", argc, argv)) > 0) cbow = atoi(argv[i + 1]); | |
if ((i = ArgPos((char *)"-alpha", argc, argv)) > 0) alpha = atof(argv[i + 1]); | |
if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(output_file, argv[i + 1]); | |
if ((i = ArgPos((char *)"-window", argc, argv)) > 0) window = atoi(argv[i + 1]); | |
if ((i = ArgPos((char *)"-sample", argc, argv)) > 0) sample = atof(argv[i + 1]); | |
if ((i = ArgPos((char *)"-hs", argc, argv)) > 0) hs = atoi(argv[i + 1]); | |
if ((i = ArgPos((char *)"-negative", argc, argv)) > 0) negative = atoi(argv[i + 1]); | |
if ((i = ArgPos((char *)"-threads", argc, argv)) > 0) num_threads = atoi(argv[i + 1]); | |
if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) min_count = atoi(argv[i + 1]); | |
if ((i = ArgPos((char *)"-classes", argc, argv)) > 0) classes = atoi(argv[i + 1]); | |
vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word)); | |
vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int)); | |
expTable = (real *)malloc((EXP_TABLE_SIZE + 1) * sizeof(real)); | |
// Precompute Sigmoid Values for x in [-MAX_EXP, MAX_EXP]
for (i = 0; i < EXP_TABLE_SIZE; i++) { | |
expTable[i] = exp((i / (real)EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table | |
expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1) | |
} | |
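// expTable[i] holds sigmoid(x) for x = (2 * i / EXP_TABLE_SIZE - 1) * MAX_EXP, i.e. x in [-MAX_EXP, MAX_EXP).
// During training the lookup index is (f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2),
// which maps f in [-MAX_EXP, MAX_EXP] approximately onto [0, EXP_TABLE_SIZE).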
TrainModel(); | |
return 0; | |
} |