CodeArchive SuffixTree

From Ubcacm
Jump to: navigation, search

Yury's implementation...

#include <cstdio>
#include <string>
#include <cctype>
using namespace std;

char s[1 << 20]; // the actual, global string
struct edge{
    int begin, end; edge *child[64], *slink, *parent;
    edge(int b = 0, int e = 0, edge *p = 0, edge *sl = 0)
   { memset(child, 0, sizeof(child)), slink = sl, begin = b, end = e, parent = p; }
    ~edge(){ for(int i = 0; i < 64; i++) if(child[i]) delete child[i]; }
};

void get_suffix(edge *&e, int &ep, int sp){ // s[e->begin + ep] == s[sp]
    if((e = e->parent)->slink) e = e->slink, ep += (e->end - e->begin);
    else ep += (e->end - e->begin - 1);
    while(ep > e->end - e->begin) ep -= (e->end - e->begin), e = e->child[s[sp - ep]];
}

edge *build_tree(){ // assume non-zero length for s
    edge *t = new edge(0, 0), *cur = t->child[s[0]] = new edge(0, (1 << 30), t);
    for(int phase = 1, pos = 1, f = 1; s[phase]; phase++, pos++){
   edge **needslink = 0;
   while(true){
       if(f) get_suffix(cur, pos, phase); else f = 1;
       if(cur->end == (1 << 30) && pos + cur->begin == phase) continue;
       if(cur->end - cur->begin > pos && s[cur->begin + pos] == s[phase] || 
          cur->child[s[phase]]) { f = 0; break; }
       if(needslink) *needslink = cur, needslink = 0;
       if(cur->end - cur->begin > pos){
      edge *oedge = new edge(pos, cur->end, cur, cur->slink);
      memcpy(oedge->child, cur->child, sizeof(cur->child));
      edge *nedge = new edge(phase, (1 << 30), cur); needslink = &(nedge->slink);
      cur->end = pos, memset(cur->child, 0, sizeof(cur->child));
      cur->child[s[phase]] = nedge, cur->child[s[cur->end]] = oedge;
       }else cur->child[s[phase]] = new edge(phase, (1 << 30), cur);
       if(cur->parent == 0 && !(f = 0)) break;
   }
    }
    return t;
}

bool find_str(edge *tree, char *w){
    if(*w == 0) return true;
    if(tree == 0) return false;
    for(int i = 0; i + tree->begin < tree->end; i++)
   if(w[i] == 0) return true;
   else if(s[tree->begin + i] != w[i]) return false;
    w += (tree->end - tree->begin);
    return find_str(tree->child[*w], w);  
}
Suffix Tree

Just implementation anyway...

#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <string>

using namespace std;

struct node_t {
   node_t *parent, *suffixLink;
   bool isLeaf;
   int sIdx, eIdx;
   node_t** children;
   node_t*& operator[] (const int k) {
      return children[k];
   }
  node_t(node_t* p, bool il, int ss, int ee):parent(p), suffixLink(NULL), isLeaf(il), sIdx(ss), eIdx(ee) {
    if (!il) {
      children = new node_t*[32];
      memset(children, 0, sizeof(children));
    }
  }
  ~node_t() {
    if (children) {
      for (int i = 0; i < 32; ++i) if (children[i]) delete children[i];
      delete [] children;
    }
};

// The str is available globally
node_t* build_suffix_tree(const char* str) {
  int n = strlen(str);
  node_t *root = new node_t(NULL, false, 0, 0);
  root->suffixLink = root; // Initialize suffix link
  (*root)[str[0]] = new node_t(root, true, 0, n);

  node_t* nextRoot = root, *unknown = NULL, *tmp = NULL;
  int lts = 0, ltc = n-1; // Those part that I can skip
  for(int i = 1; i < n; ++i) {
    tmp = nextRoot;
    if(lts > 0) { // Fill up the suffix link of the unknown
      while(lts >= // There must be an edge waiting for me
          (*tmp)[str[ n-ltc-lts ]]->eIdx - 
          (*tmp)[str[ n-ltc-lts ]]->sIdx  ) {
        // Skip as much as we know
        tmp = (*tmp)[str[ n-ltc-lts ]];
        lts -= (tmp->eIdx - tmp->sIdx);
      }

      if(lts > 0) { // Create an internal node
        node_t* newInternal = new node_t(tmp, false, n-ltc-lts, n-ltc);
        node_t* originalInternal = (*tmp)[str[ n-ltc-lts ]];
        originalInternal->parent = newInternal;
        originalInternal->sIdx += lts;
        (*tmp)[str[ n-ltc-lts ]] = newInternal;
        (*newInternal)[ str[originalInternal->sIdx] ] = originalInternal;
        tmp = newInternal;
        lts = 0;
      }
      unknown->suffixLink = tmp; // unknown != NULL, the suffixLink of unknown should be found here
    }

    while( ltc > 0 ) { // tmp is the default parent, tmp must be an internal node?
      node_t *nextTrial = (*tmp)[str[n-ltc]];
      if(nextTrial == NULL) break; // There is no match already!
      int idx = nextTrial->sIdx;
      for(; idx < nextTrial->eIdx; ++idx)
        if( str[ n-ltc + (idx - nextTrial->sIdx) ] != str[idx] ) break;
      if(idx < nextTrial->eIdx) { // Ultra troublesome case! I have to create a new internal node
        node_t* newInternal = new node_t(tmp, false, nextTrial->sIdx, idx);
        (*tmp)[str[n-ltc]] = newInternal;
        (*newInternal)[str[idx]] = nextTrial;
        ltc -= (idx - nextTrial->sIdx);
        nextTrial->sIdx = idx;
        tmp = newInternal;
        break;
      } else { // idx == nextTrial->eIdx
        tmp = nextTrial;
        ltc -= (nextTrial->eIdx - nextTrial->sIdx);
      }
    }
    node_t* newLeaf = new node_t(tmp, true, n-ltc, n);
    if(ltc == 0) (*tmp)[ 31 ] = newLeaf;
    else (*tmp)[str[n-ltc]] = newLeaf;
      
    nextRoot = newLeaf->parent; // Find my parent first
    if(nextRoot->suffixLink) { // Suffix link found, I don't have unknown now
      unknown = NULL; lts = 0; // I can skip nothing
    } else {
      unknown = nextRoot; // Fill in the suffix Link here
      lts = nextRoot->eIdx - nextRoot->sIdx;
      nextRoot = nextRoot->parent;
    }
    if(nextRoot == root) { // I must skip 1 less
      if(lts > 0) --lts;
      ltc = n - lts - 1 - i;
    }
    nextRoot = nextRoot->suffixLink;
  }
}

int longest_prefix(node_t* root, const char* str, const char *test) {
  int idx = 0;
  for(;;) {
    if (root->isLeaf) return idx;
    node_t* next = (*root)[test[idx]];
    if (!next) return idx;
    for (int i = next->sIdx; test[idx] && i < next->eIdx; ++i, ++idx)
      if (str[i] != test[idx]) return idx;
    if (!test[idx]) return idx;
    root = next;
  }
}

int main() {
  return 0;
}

-- Main.MatthewChan - 29 Mar 2006