#include <string>
#include <vector>
#include <map>
#include <iostream>
#include <fstream>
#include <cassert>
using namespace std;
////////////////////////////////////////////////////////////////////////////////////
// Tree -- a 26-ary tree
class Tree
{
public:
Tree() {};
Tree(const Tree&);
~Tree();
Tree& operator=(const Tree&);
string *find(const string& key) const;
void insert(const string& key, const string& value);
void remove(const string& key);
string& operator[](const string& key); // find and insert (key,e) to tree if needed
void compactSubtrees(); // delete unused nodes
friend ostream& operator<< (ostream& os, Tree const& t);
bool isEmpty() const;
int nWords() const; // #keywords, i.e. #occupied nodes
void takeStat(map<int,int>& stSize, map<int,int>& stCapacity) const;
// take statistic of tree overhead size and capacity
static const char *firstWord(const char *s); // find first word in s
static int wordLength(const char *w); // find length of word w
// the following should be private; they are public for testing only
bool isCompact() const;
bool isLeaf() const;
bool isLeafOrCompact() { return isLeaf() || isCompact(); }
int nNodes() const;
private:
string data;
vector<Tree*> subtrees;
void clone(const Tree&);
bool isOccupied() const; // is this *node* occupied?
void list(ostream& os, string& key) const;// list to 'os' using 'key' as a working stack
Tree& find(const char *& key) const;
Tree& insert(const char *key, const string& s);
void remove(const char *key);
void compactSubtreesVector();
static const int nLetters = 'z' - 'a' + 1;// 26
static int encode(int x)
{
char enc[]=
// a b c d e f g h i j k l m n o p q r s t u v w x y z
{2,19,11, 9, 0,15,16, 7, 4,22,21,10,13, 5, 3,18,24, 8, 6, 1,12,20,14,23,17,25};
assert(isalpha(x));
return enc[tolower(x) - 'a'];
}
static int decode(int y)
{
char dec[]="etaoinshrdlcumwfgypbvkjxqz";
assert(0 <= y && y < nLetters);
return dec[y];
}
};
// Description
// There are two (conceptual) classes of trees:
// T : tree
// T': compact tree, i.e., one using minimal number of nodes to store given words.
// A tree is _compact_ iff its leaves are all occupied.
// Although compact tree is nice, it is hard to implement recursively
// since every tree must have at least one leaf => every compact tree
// is non-empty => when initialized, a compact tree is just a leaf and must
// be already occupied, e.g. by (emptykey,somevalue).
//
// T' is a proper subclass of T, i.e., T' < T.
//
// For T', member functions may assume the input tree is T' (and take
// andvantage of the assumption for to write simpler and faster code),
// public mutators (constructor(s), destructor, operator[], insert() etc),
// on the other hand, must guarantee that every output tree is indeed T'.
// For example:
//
// - While T::remove() may use the "lazy" strategy (i.e. no memory cleanup),
// T'::remove() must cleanup upon remove.
//
// - While T::insert() may insert value e to tree, T'::insert() may not.
// To save memory, a tree node has no bool member variable indicating
// its occupation, isOccupied() is implemented by testing for empty value.
// Thus every node containing (key,e) is considered unoccupied.
//
// - operator[] can be defined for T but not for T', since it inserts
// (key,e) pairs to tree t to ensure t[key] accessibility.
//
// Although the Tree in this implementation tries to be T', technically it is T,
// i.e.,
//
// - member functions do not assume compact input tree;
//
// - t.compactSubtrees() guarantees compactness on all subtrees; and
//
// - all other public mutators except t.insert(key,e) and t[non-existing key]
// preserve tree compactness if any.
Tree::Tree(const Tree& t)
{
clone(t);
}
Tree& Tree::operator=(const Tree& t)
{
if (this != &t)
{
this->~Tree();
clone(t);
}
return *this;
}
void Tree::clone(const Tree& t)
{
data = t.data;
assert(subtrees.empty());
subtrees.resize(t.subtrees.size(),NULL);
for (size_t k = 0; k < t.subtrees.size(); k++)
if (t.subtrees[k])
subtrees[k] = new Tree(*t.subtrees[k]);
}
Tree::~Tree()
{
for (size_t k=0; k<subtrees.size(); k++)
if (subtrees[k])
delete subtrees[k];
subtrees.clear();
}
string *Tree::find(const string& key) const
{
const char *key1 = key.c_str();
Tree& tree = find(key1);
if (*key1 || !tree.isOccupied()) return NULL;
return &tree.data;
}
Tree& Tree::find(const char *&key) const
{
int k;
if (*key && (k = encode(*key)) < int(subtrees.size()) && subtrees[k])
return subtrees[k]->find(++key);
return const_cast<Tree&>(*this);
}
string& Tree::operator[](const string& key)
{
const char *key1 = key.c_str();
Tree& tree = find(key1);
if (*key1 == 0) return tree.data;
return tree.insert(key1,"").data;
}
void Tree::insert(const string& key, const string& s)
{
insert(key.c_str(),s);
}
Tree& Tree::insert(const char *key, const string& s)
{
if (*key == 0)
{
data = s;
return *this;
}
else
{
int k = encode(*key);
if (!(k < int(subtrees.size())))
{
subtrees.resize(k+1,NULL);
}
if (!subtrees[k])
{
subtrees[k] = new Tree;
}
return subtrees[k]->insert(key+1,s);
}
}
void Tree::remove(const string& key)
{
remove(key.c_str());
}
void Tree::remove(const char *key)
{
int k;
if (*key == 0)
{
data.clear();
}
else if ((k=encode(*key)) < int(subtrees.size()) && subtrees[k])
{
subtrees[k]->remove(key+1);
if (subtrees[k]->isEmpty())
{
delete subtrees[k];
subtrees[k] = NULL;
compactSubtreesVector();
}
}
}
void Tree::compactSubtrees()
{
for(int k=0; k<int(subtrees.size()); k++)
{
if (!subtrees[k]) continue;
subtrees[k]->compactSubtrees();
if(subtrees[k]->isEmpty())
{
delete subtrees[k];
subtrees[k] = NULL;
}
}
compactSubtreesVector();
}
void Tree::compactSubtreesVector()
{ // minimalize size of the vector 'subtrees',
int k1 = -1;
for (int k = int(subtrees.size())-1; k>=0; --k)
if (subtrees[k]) {k1 = k; break;}
if (int(subtrees.size()) > k1+1)
subtrees.resize(k1+1);
}
ostream& operator << (ostream& os, Tree const& t)
{
string key("");
t.list(os, key);
return os;
}
void Tree::list(ostream& os, string& key) const
{
if (isOccupied())
{
os << key << ":";
os << data << endl;
}
for (int cx = 'a'; cx <= 'z'; cx++)
{
int cy = encode(cx);
if (int(subtrees.size())>cy && subtrees[cy])
{
key.push_back(cx);
subtrees[cy]->list(os, key);
key.erase(key.end()-1);
}
}
}
bool Tree::isOccupied() const
{
return !data.empty();
}
bool Tree::isEmpty() const
{
if (isOccupied()) return false;
for (size_t k=0; k<subtrees.size(); k++)
if (subtrees[k] && !subtrees[k]->isEmpty()) return false;
return true;
}
bool Tree::isLeaf() const
{
for (size_t k=0; k<subtrees.size(); k++)
if(subtrees[k]) return false;
return true;
}
bool Tree::isCompact() const
{
if (isLeaf()) return isOccupied();
for (size_t k=0; k<subtrees.size(); k++)
if (subtrees[k] && !subtrees[k]->isCompact()) return false;
return true;
}
int Tree::nWords() const
{
int n = isOccupied()? 1 : 0;
for (size_t k=0; k<subtrees.size(); k++)
if (subtrees[k]) n+=subtrees[k]->nWords();
return n;
}
int Tree::nNodes() const
{
int n=1;
for (size_t k=0; k<subtrees.size(); k++)
if (subtrees[k]) n+=subtrees[k]->nNodes();
return n;
}
void Tree::takeStat(map<int,int>& stSize, map<int,int>& stCapa) const
{
int dNodeSize = sizeof(*this) - sizeof(data) + sizeof(Tree*) * subtrees.size();
stSize[dNodeSize]++;
int dNodeCapa = sizeof(*this) - sizeof(data) + sizeof(Tree*) * subtrees.capacity();
stCapa[dNodeCapa]++;
//if(subtrees.size()==0) cout << this << endl;
for (size_t k=0; k<subtrees.size(); k++)
if (subtrees[k]) subtrees[k]->takeStat(stSize,stCapa);
}
const char *Tree::firstWord(const char *s)
{
while (*s && !isalpha(*s)) s++;
return s;
}
int Tree::wordLength(const char *w)
{
const char *u = w;
while (isalpha(*u)) u++;
return u - w;
}
//////////////////////////////////////////////////////////////////////////////////////////
// Tests
#include <iostream>
#include <iomanip>
#include <algorithm>
#include <functional>
#include <vector>
#include <sstream>
#include <ctime>
#include <cstdlib>
#ifdef __GNUC__
#include <ext/hash_set>
#define set __gnu_cxx::hash_set
namespace __gnu_cxx
{
template<> struct hash< std::string >
{
size_t operator()( const std::string& s ) const
{
return hash< const char* >()( s.c_str() );
}
};
}
#else
#include <set>
#endif
bool operator == (Tree const& tr1, Tree const& tr2)
{
stringstream oss1(stringstream::in|stringstream::out);
stringstream oss2(stringstream::in|stringstream::out);
string s1,s2;
oss1 << tr1;
oss2 << tr2;
while (oss1 >> s1 && oss2 >> s2)
{
// cout << s1;
// cout << s2;
if (s1 != s2) return false;
}
return true;
}
unsigned genRandom(unsigned max) // generates a pseudo-random number in [0,max),
// 'max' and return value may be declared 'float'
{
static int const IM = 139968, IA = 3877, IC = 29573;
static int seed = 42;
return unsigned(double(max) * (seed = (seed * IA + IC) % IM) / IM);
// floating-point arithmetic to avoid overflow with large 'max'
}
ptrdiff_t random (ptrdiff_t i)
{
return genRandom(i);
}
ptrdiff_t (*p_random)(ptrdiff_t) = random;
int loadWords(vector<string>& v, string const& filename)
{
// parse a text file for words, store words to v uniquely
ifstream ifs(filename.c_str());
set<string> s;
string u, w;
const char *pw;
int n=0, k;
while (ifs >> u)
{
for(k = int(u.length()); k>=0; k--)
u[k] = tolower(u[k]);
for(pw = u.c_str(); *pw; pw+=k)
{
n++;
pw = Tree::firstWord(pw);
k = Tree::wordLength(pw);
w.assign(pw,k);
if (s.find(w)==s.end())
{
s.insert(w);
v.push_back(w);
}
//else cout << w << ":REPEATS ";
}
}
ifs.close();
cout << endl
<< filename
<< ":" << endl
<< "#words=" << n << " "
<< "#distinct words=" << v.size() << endl;
return s.size();
}
int listStat(map<int,int>& st)
{
map<int,int>::iterator i;
int m=0, n=0;
cout << setw
(10) << "nodesize" << setw
(10) << "#nodes" << setw(10) << "amount" << endl;
for (i=st.begin(); i!=st.end(); i++)
{
cout << setw
(10) << i
->first
<< setw
(10) << i
->second
<< setw(10) << (i->first * i->second) << endl;
m += i->first * i->second;
n += i->second;
}
cout << setw
(10) << "Total:" << setw
(10) << n
<< setw(10) << m << " bytes." << endl;
return m;
}
void transferData(vector<string>& v0, vector<string>& v01, vector<string>& v1)
{
// transfer a part of v0 to v1 and construct v01 as a delta list, i.e.
// v01 = a subset of v0, v0 -= v01, v1 += v01, v01 += some words from v1
v01.clear();
random_shuffle(v0.begin(), v0.end(), p_random);
for (int n = genRandom(v0.size()+1); n; n--)
{
string w = v0.back();
v0.pop_back();
v01.push_back(w);
}
v1.insert(v1.end(),v01.begin(),v01.end());
for (int n = genRandom(v1.size()/2); n; n--)
{
string w = v1[genRandom(v1.size())];
v01.push_back(w);
}
}
void treeInsert(Tree& t, vector<string>& v)
{
random_shuffle(v.begin(), v.end(), p_random);
for (size_t i=0; i<v.size(); i++)
t.insert(v[i],"giai nghia tieng Viet");
}
void treeInsert2(Tree& t, vector<string>& v)
{
random_shuffle(v.begin(), v.end(), p_random);
for (size_t i=0; i<v.size(); i++)
t[v[i]] = "giai nghia tieng Viet";
}
void treeRemove(Tree&t, vector<string>& v)
{
random_shuffle(v.begin(), v.end(), p_random);
for (size_t i=0; i<v.size(); i++)
t.remove(v[i]);
}
void treeRemove2(Tree&t, vector<string>& v)
{
random_shuffle(v.begin(), v.end(), p_random);
for (size_t i=0; i<v.size(); i++)
t[v[i]] = "";
t.compactSubtrees();
}
#define DISPLAYCOUNTS \
cout << "v0:" << v0.size() << " v01:" << v01.size() \
<< " v1:" << v1.size() \
<< " ta:" << ta.nWords() << "/" << ta.nNodes() \
<< " tb:" << tb.nWords() << "/" << tb.nNodes() << endl;
void testTree()
{
vector<string> v0; // the set of out-tree words
vector<string> v1; // the set of in-tree words
vector<string> v01; // the delta list, ie. list of words
// to be inserted to tree or removed from tree
map<int,int> st10size, st10capa, st90size, st90capa;
int nWords, nWords10, nWords90;
bool st10taken=false, st90taken=false;
Tree ta,tb;
//nWords = loadWords(v0, "english-1857w.txt");
//nWords = loadWords(v0, "english-058kw-corncob.txt");
//nWords = loadWords(v0, "english-087kw-british.txt");
//nWords = loadWords(v0, "english-110kw-SIL.txt");
nWords = loadWords(v0, "english-346kw-raw.txt");
//nWords = loadWords(v0, "english-spoken-written.txt");
//nWords = loadWords(v0, "english-british-cap.txt");
DISPLAYCOUNTS;
while (!(st10taken && st90taken))
{
// v01 = a subset of v0, v0 -= v01, v1 += v01,
// v01 += some words from v1
transferData(v0,v01,v1);
// ta += v01 shuffle, tb += v01 shuffle
treeInsert(ta,v01);
if(genRandom(2)) treeInsert(tb,v01); else treeInsert2(tb,v01);
if (nWords*0.89 <= v1.size() && v1.size() <= nWords*0.91 && !st90taken)
{
ta.takeStat(st90size,st90capa);
nWords90=v1.size();
st90taken = true;
}
DISPLAYCOUNTS;
// v01 = a subset of v1, v1 -= v01, v0 += v01,
// v01 += some words from v0
transferData(v1,v01,v0);
// ta -= v01 shuffle, tb -= v01 shuffle
treeRemove(ta,v01);
if(genRandom(2)) treeRemove(tb,v01); else treeRemove2(tb,v01);
if (nWords*0.09 <= v1.size() && v1.size() <= nWords*0.11 && !st10taken)
{
ta.takeStat(st10size,st10capa);
nWords10=v1.size();
st10taken = true;
}
DISPLAYCOUNTS;
assert(ta.nWords()==int(v1.size()) && tb.nWords()==int(v1.size()));
assert(ta==tb);
assert(ta.isLeafOrCompact() && tb.isLeafOrCompact());
}
// insert the entire v0 to trees, i.e. v01=v0, v0 = emptyset, v1 += v01,
// ta += v01 shuffle, tb += v01 shuffle
v01 = v0;
v0.clear();
v1.insert(v1.end(),v01.begin(),v01.end());
treeInsert(ta,v01);
treeInsert(tb,v01);
DISPLAYCOUNTS;
assert(ta==tb);
assert(ta.isLeafOrCompact() && tb.isLeafOrCompact());
// remove the entire v1 from trees, i.e. v01=v1, v1 = emptyset, v0 += v01,
// ta -= v01 shuffle, tb -= v01 shuffle
v01 = v1;
v1.clear();
v0.insert(v0.end(),v01.begin(),v01.end());
treeRemove(ta,v01);
treeRemove(tb,v01);
DISPLAYCOUNTS;
assert(ta==tb);
assert(ta.nWords()==0 && tb.nWords()==0);
assert(ta.isEmpty() && tb.isEmpty());
assert(ta.nNodes()==1 && tb.nNodes()==1);
assert(ta.isLeaf() && tb.isLeaf());
cout << endl
<< "Testing completed." << endl
<< endl
;
// display the tree size & capacity statistics taken before
cout << endl
<< "Tree size statistic @ " << nWords10
<< " words:" << endl
; listStat(st10size);
cout << endl
<< "Tree capa statistic @ " << nWords10
<< " words:" << endl
; listStat(st10capa);
cout << endl
<< "Tree size statistic @ " << nWords90
<< " words:" << endl
; listStat(st90size);
cout << endl
<< "Tree capa statistic @ " << nWords90
<< " words:" << endl
; listStat(st90capa);
}
int main()
{
testTree();
return 0;
}