使用C++实现mmseg,对中文句子进行分词(二)

2014-11-24 12:17:24 · 作者: · 浏览: 1
] = tempSet;

}
cout << "finish read the lexicon." << endl;
cout << "finish read the lexicon .lexicon size:" << content.size() << endl;

fin.close();
}
void show_Lexicon() {
map >::iterator lexiconIterator;
int count = 0;
for (lexiconIterator = content.begin(); lexiconIterator != content.end();
lexiconIterator++) {
string first = lexiconIterator->first;
set second = lexiconIterator->second;
cout << first << ":";
set::iterator setIt;
for (setIt = second.begin(); setIt != second.end(); setIt++) {
cout << *setIt << " ";
}
cout << endl;
count++;
if (count == 10) {
break;
}
}
cout << "lexicon size:" << content.size() << endl;
}
void write_index() {
fstream outputFile("lexicon.index", fstream::out);
map >::iterator lexiconIterator;
for (lexiconIterator = content.begin(); lexiconIterator != content.end();
lexiconIterator++) {
string first = lexiconIterator->first;
set second = lexiconIterator->second;
outputFile << "#:" << first << endl;
set::iterator setIt;
for (setIt = second.begin(); setIt != second.end(); setIt++) {
outputFile << *setIt << " ";
}
outputFile << endl;
}
cout << "write index: lexicon size:" << content.size() << endl;
outputFile.close();
}
void write_freq() {
fstream outputFile("freq.index", fstream::out);
map::iterator freqIterator;
for (freqIterator = freq.begin(); freqIterator != freq.end();
freqIterator++) {
string first = freqIterator->first;
int second = freqIterator->second;
outputFile << first << " " << second;

outputFile << endl;
}
cout << "write index: freq size:" << freq.size() << endl;
outputFile.close();

}
/**
 * Loads word frequencies from the file "freq.index" into the file-level
 * `freq` table and prints the resulting table size to stdout.
 *
 * The file is read as whitespace-separated "<word> <count>" pairs. A pair
 * is kept only if its count is greater than 1 or the word's byte length
 * equals ChineseLength (defined elsewhere; presumably the byte width of a
 * single Chinese character - confirm against its definition).
 */
void build_freq() {
    ifstream fin("freq.index");
    string tempStr;
    int tempFreq = 0;
    // Extract the word and its count in one checked step. Previously the
    // count was read separately and unchecked, so a truncated or malformed
    // record was stored with a stale or unspecified count.
    while (fin >> tempStr >> tempFreq) {
        if (tempFreq > 1 || tempStr.length() == ChineseLength) {
            freq[tempStr] = tempFreq;
        }
    }
    fin.close();
    // write_freq();
    cout << "index: freq size:" << freq.size() << endl;
}

void build_index() {
ifstream fin("lexicon.index");
// cout< if (fin.good() == 0) {
cout << "build index , need some time , please wait for a moment! \n";
read_terms_from_Lexicon();
write_index();
fin.open("lexicon.index", ifstream::in);
}
cout << "hello ,begin load index \n";
string tempStr;
set tempSet;
string key = "";
while (fin >> tempStr) {
if (tempStr.find("#:", 0) == 0) {
if (key != "")
content[key] = tempSet;
tempSet.clear();
key = tempStr.substr(2);
} else {
tempSet.insert(tempStr);
}
// if(content.size()>10){
// break;
// }
}
content[key] = tempSet;
cout << "lexicon size:" << content.size() << endl;
fin.close();
build_freq();
}

// Chunks collected by mmseg_recursion, which push_backs each completed
// Chunk here. The element type had been lost from this declaration.
vector<Chunk> chunklist;
// Smallest word count seen among collected chunks; mmseg_recursion lowers
// it whenever a completed chunk contains fewer words.
int minChunkWordNumber;
void mmseg_recursion(string src, Chunk tempChunk) {
set termslist;
int i = 0;
int len = src.length();
//get the single character.
string singleWordStr;
Word tempWord;
singleWordStr = src.substr(0, ChineseLength);
tempWord.setValue(singleWordStr);

//if the character is the end character , return
if ((i + ChineseLength) >= len) {
tempChunk.addWord(tempWord);
chunklist.push_back(tempChunk);
if (minChunkWordNumber > tempChunk.getWordNumber()) {
minChunkWordNumber = tempChunk.getWordNumber();
}
return;
} else {
string tempStr;
map >::iterator lexIt;
lexIt = content.find(singleWordStr);
//if the lexicon has no this word
//them let it be a single term
if