使用C++实现mmseg,对中文句子进行分词(四)

2014-11-24 12:17:24 · 作者: · 浏览: 2
src.length() * src.length();
vector tempIndex = indexInChunkList;
indexInChunkList.clear();
for (size_t index = 0; index < tempIndex.size(); index++) {
int i = tempIndex.at(index);
if (chunklist.at(i).getVariance() < minVariance) {
minVariance = chunklist.at(i).getVariance();
indexInChunkList.clear();
indexInChunkList.push_back(i);
} else if (chunklist.at(i).getVariance() == minVariance) {
indexInChunkList.push_back(i);
}
}

if (indexInChunkList.size() == 1) {
return chunklist.at(indexInChunkList.at(0)).getVectorString();
} else {
//rule 3 have most frequency terms
vector tempIndex = indexInChunkList;
indexInChunkList.clear();
long max = 0;
int tempIndexSize = tempIndex.size();
for (int index = 0; index < tempIndexSize; index++) {
int i = tempIndex.at(index);
if (chunklist.at(i).getFreq() > max) {
max = chunklist.at(i).getFreq();
indexInChunkList.clear();
indexInChunkList.push_back(i);
} else if (chunklist.at(i).getFreq() == max) {
indexInChunkList.push_back(i);
}
}
return chunklist.at(indexInChunkList.at(0)).getVectorString();
}
}
}
}
/**
 * Prints a segmentation result to standard output on a single line.
 *
 * Output format: the literal prefix "segment like this:", then every term
 * followed by one space, then a newline (the stream is flushed).
 *
 * @param src terms to print, in order. Taken by value to keep the
 *            signature exactly as callers already use it.
 */
void showTermsSegment(std::vector<std::string> src) {
    std::cout << "segment like this:";
    // Unsigned index: avoids narrowing size() into an int and the
    // signed/unsigned comparison that comes with it.
    for (std::vector<std::string>::size_type i = 0; i < src.size(); ++i) {
        std::cout << src[i] << " ";
    }
    std::cout << std::endl;
}
int main() {
initial();
// read_terms_from_Lexicon();
// write_index();
build_index();
// show_Lexicon();
string test = "中华人民共和国在1949年建立";
test = "从此开始了新中国的伟大篇章";
test = "研究生命起源";
test = "北京天安门";
// 从此开始了新中国的伟大篇章中华人民共和国在一九四九年建立
test = "主要是因为研究生命起源北京天安门";
test = "从此开始了新中国的伟大篇章中华人民共和国在一九五五年建立主要是因为研究生命起源北京天安门";

// test ="国际化企业中华人民共和国";
// size_t found;
// found = test.find("开始", 10);
// bool flag = (found != string::npos);
// cout << test.substr(test.length(), 3) << endl;
// cout << test.substr(24, 4) << endl;
// test = ",";
// cout << test.length();

// vector res = mmseg(test);
// int min = 0x7fffffff;
// cout << min;
vector seg = mmseg(test);
// seg.push_back("从");
// sort(seg.begin(), seg.end());
cout << endl;
cout << "test string :" << test << endl;
showTermsSegment(seg);

test = "主要是因为研究生死 www.2cto.com";
seg = mmseg(test);
cout << endl;
cout << "test string :" << test << endl;
showTermsSegment(seg);

return 0;
}


摘自 laiconglin的专栏