gsinclair/wordchains.cpp

## wordchains.cpp
/* Word chains (ORAC Challenge Problems 2)
 *
 * Build a tree with all the letters in it, marking the word depth and (boolean)
 * terminator status of each letter. Remember where the leaves are (in a set). Then it's
 * an easy matter to find the leaf with the greatest word depth, build the correct path,
 * and emit all the words in that path.
 *
 * Attempt 1 failed -- all tests had memory limit exceeded. No memory limit was stated in
 * the problem.
 *
 * Attempt 2 addressed the memory by removing two non-essential items from each node: the
 * id and the depth. Failed: timeout in every test.
 *
 * The only excessive algorithmic complexity I can think of is the linear lookup of the
 * children of each node.
 *
 * Attempt 3 introduces a map of children instead of a vector. The result of this is 'out
 * of memory' :(
 *
 * OK, I think the reason I'm getting out of memory is because I don't need so many nodes.
 * The problem statement says that all the input words combined will not exceed 2,000,000
 * letters. I've been pre-allocating that many nodes. But many of those letters will
 * already be in the tree as it grows, meaning a new node is not needed all the time.
 *
 * I can't know in advance the maximum possible number of nodes required, so I'll have to
 * use a vector, I suppose.
 *
 * Attempt 4: use a vector of nodes instead of a pre-allocated array. This means creating
 * a constructor for nodes so I can use emplace_back instead of push_back. It also means
 * removing NEXT_NODE_ID.
 *
 *    --> score 80% with two timeouts
 *
 * I don't know why timeouts would occur. Could be a technicality (ORAC isn't as fast as
 * the competition server) or it could be that using a map for children isn't as fast as
 * vector, although I doubt it.
 *
 * Attempt 5: remove the LEAVES set and just do a full linear search for the deepest node
 * after the tree is built. This makes sense, actually. Result: exactly as above.
 *
 * Attempt 6: keep track of maximum word depth as we build the tree. These are in the
 * global variables MAX_WORD_DEPTH and MAX_WORD_DEPTH_ID. Result: exactly as above!
 *
 * Attempt 7: replace the map of children with a 26-int array. That may be a bit wasteful
 * of memory, but probably not that much, and it should definitely give faster lookup.
 *
 *    --> score: 100% !!!
 *
 * Everything _was_ much faster with the array lookup. About 3.5x faster.
 */
#include <bits/stdc++.h>
using namespace std;

typedef struct Node {
  int  parent_id;
  int  children[26];
  char letter;
  bool terminator;
  int  word_depth;
  Node(int pid, char l, bool t, int wd)
    : parent_id(pid), letter(l), terminator(t), word_depth(wd) {
      for (int i = 0; i < 26; i++)
        this->children[i] = -1;
    };
} node;

// The nodes of our tree.
vector<node> NODES;

// (Unfortunately) global variables to keep track of the node with the greatest word depth.
int MAX_WORD_DEPTH = 0;
int MAX_WORD_DEPTH_ID = 0;

// -------------------------------------------------- Tree building functions

void init_tree_root_node() {
  NODES.emplace_back(-1, '-', false, 0);
}

void add_child(int parent_id, char letter, int child_id) {
  int key = letter - 'a';
  NODES[parent_id].children[key] = child_id;
}

// Creates a new node in NODES.
// LEAVES is not touched by this; the caller of this function has responsibility for that.
// The new node has the word_depth assigned, unless it's a terminator, in which case it
// gets one more than that.
// Returns the id of the created node.
int new_node(int parent_id, char letter, bool terminator, int word_depth) {
  NODES.emplace_back(parent_id, letter, terminator, word_depth);
  int new_id = NODES.size() - 1;
  if (terminator) {
    NODES[new_id].word_depth++;
    // We may have a new greatest word depth.
    if (NODES[new_id].word_depth > MAX_WORD_DEPTH) {
      MAX_WORD_DEPTH    = NODES[new_id].word_depth;
      MAX_WORD_DEPTH_ID = new_id;
    }
  }
  add_child(parent_id, letter, new_id);
  return new_id;
}

// Returns the id of the child node matching the given letter, or -1 if none exists.
int find_child(int pid, char letter) {
  int key = letter - 'a';
  return NODES[pid].children[key];
}

void insert(char *word) {
  int id = 0;                   // Start at root.
  int pid = -1;                 // Keep track of parent.
  int L = strlen(word);         // The length of the word - we need to know when we have finshed.
  int idx = 0;
  // Find the end of the tree for this word.
  while (true) {
    if (word[idx] == 0) break;  // This should never occur. We should run out of tree before we run out of word.
    pid = id;                   // Update parent.
    id  = find_child(id, word[idx]);
    if (id == -1) break;        // End of the tree.
    idx++;                      // Move on to next character.
  }
  // We are at the end of the tree, and idx marks the first character that needs to be inserted.
  // Currently id == -1 and pid is the leaf node.
  // We add the rest of the characters, but must prepare some data first.
  id = pid;
  int word_depth = NODES[id].word_depth;
  while (idx < L) {
    bool terminator = (idx == L-1);
    id = new_node(id, word[idx], terminator, word_depth);
    idx++;
  }
}

// -------------------------------------------------- Problem-solving functions

deque<int> path_to_node(int target_id) {
  deque<int> path;
  int id = target_id;
  while (id != 0) {
    path.push_front(id);
    id = NODES[id].parent_id;
  }
  return path;
}

char BUFFER[100];

void print_words_in_path(deque<int> path, ofstream &out) {
  char *c = BUFFER;
  for (int id : path) {
    node &n = NODES[id];
    *c++ = n.letter;
    if (n.terminator)
      out << BUFFER << endl;
  }
}

// -------------------------------------------------- Debugging functions

void print_tree() {
  for (int id = 0; id < NODES.size(); id++) {
    node &n = NODES[id];
    if (n.terminator)
      printf("[%2d] '%c' * %d\n", id, n.letter, n.word_depth);
    else
      printf("[%2d] '%c'   %d\n", id, n.letter, n.word_depth);
  }
}

// -------------------------------------------------- Testing functions

// -------------------------------------------------- Main

int main() {
  // Open the I/O files.
  ifstream fin("chains.in");
  ofstream fout("chains.out");

  // Initialise the tree.
  init_tree_root_node();

  char word[100];       // Words have length 1..75.

  // Read the input and build the tree.
  while (true) {
    fin >> word;
    if (strcmp(word, ".") == 0)
      break;
    else
      insert(word);
  }

  // Find the path to the desired leaf node.
  auto path = path_to_node(MAX_WORD_DEPTH_ID);

  // Print the solution (all words implied by the path).
  print_words_in_path(path, fout);
}
	/* Word chains (ORAC Challenge Problems 2)
	*
	* Build a tree with all the letters in it, marking the word depth and (boolean)
	* terminator status of each letter. Remember where the leaves are (in a set). Then it's
	* an easy matter to find the leaf with the greatest word depth, build the correct path,
	* and emit all the words in that path.
	*
	* Attempt 1 failed -- all tests had memory limit exceeded. No memory limit was stated in
	* the problem.
	*
	* Attempt 2 addressed the memory by removing two non-essential items from each node: the
	* id and the depth. Failed: timeout in every test.
	*
	* The only excessive algorithmic complexity I can think of is the linear lookup of the
	* children of each node.
	*
	* Attempt 3 introduces a map of children instead of a vector. The result of this is 'out
	* of memory' :(
	*
	* OK, I think the reason I'm getting out of memory is because I don't need so many nodes.
	* The problem statement says that all the input words combined will not exceed 2,000,000
	* letters. I've been pre-allocating that many nodes. But many of those letters will
	* already be in the tree as it grows, meaning a new node is not needed all the time.
	*
	* I can't know in advance the maximum possible number of nodes required, so I'll have to
	* use a vector, I suppose.
	*
	* Attempt 4: use a vector of nodes instead of a pre-allocated array. This means creating
	* a constructor for nodes so I can use emplace_back instead of push_back. It also means
	* removing NEXT_NODE_ID.
	*
	* --> score 80% with two timeouts
	*
	* I don't know why timeouts would occur. Could be a technicality (ORAC isn't as fast as
	* the competition server) or it could be that using a map for children isn't as fast as
	* vector, although I doubt it.
	*
	* Attempt 5: remove the LEAVES set and just do a full linear search for the deepest node
	* after the tree is built. This makes sense, actually. Result: exactly as above.
	*
	* Attempt 6: keep track of maximum word depth as we build the tree. These are in the
	* global variables MAX_WORD_DEPTH and MAX_WORD_DEPTH_ID. Result: exactly as above!
	*
	* Attempt 7: replace the map of children with a 26-int array. That may be a bit wasteful
	* of memory, but probably not that much, and it should definitely give faster lookup.
	*
	* --> score: 100% !!!
	*
	* Everything _was_ much faster with the array lookup. About 3.5x faster.
	*/
	#include <bits/stdc++.h>
	using namespace std;

	typedef struct Node {
	int parent_id;
	int children[26];
	char letter;
	bool terminator;
	int word_depth;
	Node(int pid, char l, bool t, int wd)
	: parent_id(pid), letter(l), terminator(t), word_depth(wd) {
	for (int i = 0; i < 26; i++)
	this->children[i] = -1;
	};
	} node;

	// The nodes of our tree.
	vector<node> NODES;

	// (Unfortunately) global variables to keep track of the node with the greatest word depth.
	int MAX_WORD_DEPTH = 0;
	int MAX_WORD_DEPTH_ID = 0;

	// -------------------------------------------------- Tree building functions

	void init_tree_root_node() {
	NODES.emplace_back(-1, '-', false, 0);
	}

	void add_child(int parent_id, char letter, int child_id) {
	int key = letter - 'a';
	NODES[parent_id].children[key] = child_id;
	}

	// Creates a new node in NODES.
	// LEAVES is not touched by this; the caller of this function has responsibility for that.
	// The new node has the word_depth assigned, unless it's a terminator, in which case it
	// gets one more than that.
	// Returns the id of the created node.
	int new_node(int parent_id, char letter, bool terminator, int word_depth) {
	NODES.emplace_back(parent_id, letter, terminator, word_depth);
	int new_id = NODES.size() - 1;
	if (terminator) {
	NODES[new_id].word_depth++;
	// We may have a new greatest word depth.
	if (NODES[new_id].word_depth > MAX_WORD_DEPTH) {
	MAX_WORD_DEPTH = NODES[new_id].word_depth;
	MAX_WORD_DEPTH_ID = new_id;
	}
	}
	add_child(parent_id, letter, new_id);
	return new_id;
	}

	// Returns the id of the child node matching the given letter, or -1 if none exists.
	int find_child(int pid, char letter) {
	int key = letter - 'a';
	return NODES[pid].children[key];
	}

	void insert(char *word) {
	int id = 0; // Start at root.
	int pid = -1; // Keep track of parent.
	int L = strlen(word); // The length of the word - we need to know when we have finshed.
	int idx = 0;
	// Find the end of the tree for this word.
	while (true) {
	if (word[idx] == 0) break; // This should never occur. We should run out of tree before we run out of word.
	pid = id; // Update parent.
	id = find_child(id, word[idx]);
	if (id == -1) break; // End of the tree.
	idx++; // Move on to next character.
	}
	// We are at the end of the tree, and idx marks the first character that needs to be inserted.
	// Currently id == -1 and pid is the leaf node.
	// We add the rest of the characters, but must prepare some data first.
	id = pid;
	int word_depth = NODES[id].word_depth;
	while (idx < L) {
	bool terminator = (idx == L-1);
	id = new_node(id, word[idx], terminator, word_depth);
	idx++;
	}
	}

	// -------------------------------------------------- Problem-solving functions

	deque<int> path_to_node(int target_id) {
	deque<int> path;
	int id = target_id;
	while (id != 0) {
	path.push_front(id);
	id = NODES[id].parent_id;
	}
	return path;
	}

	char BUFFER[100];

	void print_words_in_path(deque<int> path, ofstream &out) {
	char *c = BUFFER;
	for (int id : path) {
	node &n = NODES[id];
	*c++ = n.letter;
	if (n.terminator)
	out << BUFFER << endl;
	}
	}

	// -------------------------------------------------- Debugging functions

	void print_tree() {
	for (int id = 0; id < NODES.size(); id++) {
	node &n = NODES[id];
	if (n.terminator)
	printf("[%2d] '%c' * %d\n", id, n.letter, n.word_depth);
	else
	printf("[%2d] '%c' %d\n", id, n.letter, n.word_depth);
	}
	}

	// -------------------------------------------------- Testing functions

	// -------------------------------------------------- Main

	int main() {
	// Open the I/O files.
	ifstream fin("chains.in");
	ofstream fout("chains.out");

	// Initialise the tree.
	init_tree_root_node();

	char word[100]; // Words have length 1..75.

	// Read the input and build the tree.
	while (true) {
	fin >> word;
	if (strcmp(word, ".") == 0)
	break;
	else
	insert(word);
	}

	// Find the path to the desired leaf node.
	auto path = path_to_node(MAX_WORD_DEPTH_ID);

	// Print the solution (all words implied by the path).
	print_words_in_path(path, fout);
	}