使用C++实现mmseg,对中文句子进行分词(三)

2014-11-24 12:17:24 · 作者: · 浏览: 3
(lexIt == content.end()) {
tempChunk.addWord(tempWord);
string remain = src.substr(ChineseLength);
mmseg_recursion(remain, tempChunk);
tempChunk.list.pop_back();
} else {
termslist = content[singleWordStr];
set::iterator setIt;
vector termsVector;
// for (setIt = termslist.begin(); setIt != termslist.end(); setIt++) {
// termsVector.push_back(*setIt);
// }
// sort(termsVector.begin(), termsVector.end());
// int sizeVec = termsVector.size();
for (setIt = termslist.begin(); setIt != termslist.end(); setIt++) {
tempStr = *setIt;
size_t foundit = src.find(tempStr, 0);
if (foundit == 0 && tempStr != singleWordStr) {
tempWord.setValue(tempStr);
tempChunk.addWord(tempWord);
if (tempChunk.getWordNumber() > (minChunkWordNumber)) {
tempChunk.list.pop_back();
return;
}

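// If this term consumes the entire remaining string, the chunk is complete:
// record it, lower minChunkWordNumber if this chunk has fewer words, and return.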
if (tempStr.length() == src.length()) {
chunklist.push_back(tempChunk);
if (minChunkWordNumber > tempChunk.getWordNumber()) {
minChunkWordNumber = tempChunk.getWordNumber();
}
return;
}
string remain = src.substr(tempStr.length());
mmseg_recursion(remain, tempChunk);
tempChunk.list.pop_back();
}
}
// Single-character case: take singleWordStr as a word by itself and recurse on the rest of the string.
tempStr = singleWordStr;
tempWord.setValue(tempStr);
tempChunk.addWord(tempWord);
if (tempChunk.getWordNumber() > (minChunkWordNumber)) {
tempChunk.list.pop_back();
return;
}
string remain = src.substr(tempStr.length());
mmseg_recursion(remain, tempChunk);
tempChunk.list.pop_back();

}
}
}

vector mmseg(string src) {
vector res;
chunklist.clear();
minChunkWordNumber = 0x7ffffff0;
Chunk tempChunk;
vector indexInChunkList;
int min = 0x7fffffff;
// cout << min;
mmseg_recursion(src, tempChunk);
int chunkListSize = chunklist.size();
if (chunkListSize == 1) {
return chunklist.at(0).getVectorString();
} else {
for (int i = 0; i < chunkListSize; i++) {
if (chunklist.at(i).getWordNumber() < min) {
min = chunklist.at(i).getWordNumber();
indexInChunkList.clear();
indexInChunkList.push_back(i);
} else if (chunklist.at(i).getWordNumber() == min) {
indexInChunkList.push_back(i);
}
}
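// Rule 1: the loop above kept only the chunks with the fewest words (presumably
// equivalent to the largest average word length, since the chunks segment the
// same sentence). If exactly one chunk remains, it is the answer.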
if (indexInChunkList.size() == 1) {
return chunklist.at(indexInChunkList.at(0)).getVectorString();
} else {
// Rule 2: several chunks tie on word count, so choose the one with the smallest variance.
double minVariance = min *