00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020 #include <assert.h>
00021 #include "idcluster.h"
00022 #include <iostream>
00023 #include <stdio.h>
00024 #include <math.h>
00025 #include <vector>
00026
00027 using namespace std;
00028
00029
00030 void id_cluster::clear()
00031 {
00032 histo.clear();
00033 total=0;
00034 weighted_sum=0;
00035 id=0;
00036 }
00037
00038 id_cluster_collection::id_cluster_collection(id_cluster_collection::query_flags flags) : flags(flags), version(0)
00039 {
00040 is_idf_normalized=false;
00041 }
00042
00043 void id_cluster_collection::merge_clusters(id_cluster *a, id_cluster *b)
00044 {
00045
00046 for (id_cluster::uumap::iterator it = b->histo.begin(); it!=b->histo.end(); ++it)
00047 a->add(it->first, it->second);
00048 remove_cluster(b);
00049 delete b;
00050
00051
00052 for (id_cluster::uumap::iterator it = a->histo.begin(); it!=a->histo.end(); ++it) {
00053 id2cluster[it->first][a] = (float)a->histo[it->first];
00054 }
00055 is_idf_normalized = false;
00056 version++;
00057 }
00058
00059 id_cluster *id_cluster_collection::get_best_cluster(id_cluster *c, float *dist)
00060 {
00061 id_cluster *r;
00062 cluster_score_map scores;
00063 get_scores(c, scores, &r, dist);
00064
00065 return r;
00066 }
00067
00068 void id_cluster_collection::set_query_rules(id_cluster_collection::query_flags _flags)
00069 {
00070 if (is_idf_normalized && _flags == flags) return;
00071 flags = _flags;
00072 is_idf_normalized = true;
00073
00074 for (cluster_set::iterator it(clusters.begin()); it!=clusters.end(); it++)
00075 {
00076 float * sum = const_cast<float *>(&((*it)->weighted_sum));
00077 *sum = 0;
00078
00079 if ( flags & (QUERY_BIN_FREQ | QUERY_IDF)) {
00080 for (id_cluster::uumap::iterator bin((*it)->histo.begin()); bin != (*it)->histo.end(); ++bin)
00081 {
00082 *sum += ((flags&QUERY_BIN_FREQ) ? 1 : bin->second)
00083 * ((flags&QUERY_IDF) ? idf(bin->first) : 1);
00084 }
00085 } else {
00086 *sum = (*it)->total;
00087 }
00088 }
00089
00090
00091
00092
00093
00094
00095
00096 }
00097
00098 void id_cluster_collection::get_scores(id_cluster *c, cluster_score_map &scores, id_cluster **best_c, float *_best_s)
00099 {
00100 if (best_c) *best_c = 0;
00101 double best_s=0;
00102
00103 set_query_rules(flags);
00104
00105 for (id_cluster::uumap::iterator it = c->histo.begin(); it!=c->histo.end(); ++it)
00106 {
00107 id2cluster_map::iterator id_it = id2cluster.find(it->first);
00108 double w_e = it->second;
00109
00110 if (flags & QUERY_NORMALIZED_FREQ)
00111 w_e = w_e / (double)c->total;
00112
00113 if (flags & QUERY_IDF)
00114 w_e *= idf(id_it);
00115
00116 if (id_it!=id2cluster.end()) {
00117 for (cluster_map::iterator cit=id_it->second.begin(); cit!=id_it->second.end(); ++cit) {
00118 if (cit->first != c) {
00119
00120 double n = cit->second;
00121 if (flags & QUERY_NORMALIZED_FREQ)
00122 n = n / (double)cit->first->total;
00123 assert(n>0);
00124 double p = w_e * n;
00125
00126 cluster_score_map::iterator __i = scores.lower_bound(cit->first);
00127 if (__i == scores.end() || cit->first<(*__i).first) {
00128 scores.insert(__i, pair<id_cluster *, double>(cit->first, p));
00129 } else {
00130 p = __i->second += p;
00131 }
00132 if (best_c && p > best_s) {
00133 best_s = p;
00134 *best_c = cit->first;
00135 }
00136 }
00137 }
00138 }
00139 }
00140
00141 if (best_c) {
00142 if (_best_s) {
00143 *_best_s = best_s;
00144 assert(*_best_s>=0 && *_best_s<=1);
00145 }
00146 }
00147 }
00148
00149 void id_cluster_collection::add_cluster(id_cluster *c)
00150 {
00151 version++;
00152 is_idf_normalized=false;
00153
00154 if (clusters.insert(c).second) c->id = clusters.size();
00155 assert(c->id!=0);
00156
00157
00158 for (id_cluster::uumap::iterator it = c->histo.begin(); it!=c->histo.end(); ++it) {
00159 id2cluster[it->first][c] = (float)c->histo[it->first];
00160
00161 }
00162 }
00163
00164 float id_cluster::dotprod(const id_cluster &a) const
00165 {
00166 if (total==0 || a.total==0) return 0;
00167
00168 double dp=0;
00169 uumap::const_iterator it1 = histo.begin();
00170 uumap::const_iterator it2 = a.histo.begin();
00171
00172 assert(it1!=histo.end() && it2!=a.histo.end());
00173
00174 while (1) {
00175 if (it1->first < it2->first) {
00176 it1 = histo.lower_bound(it2->first);
00177 if (it1 == histo.end()) break;
00178 } else if (it2->first < it1->first) {
00179 it2 = a.histo.lower_bound(it1->first);
00180 if (it2 == a.histo.end()) break;
00181 } else {
00182 dp += (double)it1->second/(double)total * (double)it2->second/(double)a.total;
00183 ++it1;
00184 if (it1 == histo.end()) break;
00185 ++it2;
00186 if (it2 == a.histo.end()) break;
00187 }
00188 }
00189 return dp/(total*a.total);
00190
00191 }
00192
00193 unsigned id_cluster::add(unsigned id, int amount)
00194 {
00195 if (amount == 0) return 0;
00196 uumap::iterator hbin = histo.lower_bound(id);
00197
00198 int tot = (int)total + amount;
00199 assert(tot>=0);
00200 total = (unsigned)tot;
00201
00202 if (hbin == histo.end() || hbin->first != id) {
00203 assert(amount>0);
00204 histo.insert(hbin, pair<unsigned,unsigned>(id,amount));
00205 return amount;
00206 } else {
00207 if ((int)hbin->second + amount <= 0) {
00208
00209 histo.erase(hbin);
00210 return 0;
00211 } else {
00212 hbin->second += amount;
00213 return hbin->second;
00214 }
00215 }
00216 }
00217
00218 unsigned id_cluster::get_freq(unsigned id)
00219 {
00220 id_cluster::uumap::iterator r = histo.find(id);
00221 if (r == histo.end())
00222 return 0;
00223 return r->second;
00224 }
00225
00226 void id_cluster::print() const {
00227 for (id_cluster::uumap::const_iterator id_it(histo.begin()); id_it != histo.end(); ++id_it) {
00228 cout << "\t" << id_it->first <<":" << id_it->second
00229 << "(" << (float)id_it->second/(float)total << ")\n";
00230 }
00231 }
00232
00233 void id_cluster_collection::print()
00234 {
00235 unsigned non_ambig=0;
00236 cout << "Ambiguities:\n";
00237 for (id2cluster_map::iterator it(id2cluster.begin()); it!=id2cluster.end(); ++it)
00238 {
00239 int n=0;
00240 for (cluster_map::iterator cit=it->second.begin(); cit!=it->second.end(); ++cit) {
00241 float p = cit->second;
00242 if (p>.1) n++;
00243 }
00244 if (it->second.size()>0) {
00245 cout << " " << it->second.size() << "("<<n<<") ";
00246 if (it->second.size()==1) non_ambig++;
00247 }
00248 }
00249 cout << "\n.. and " << non_ambig << " non-ambiguous nodes, over " << id2cluster.size()
00250 << ". Total number of clusters: " << clusters.size()
00251 << endl;
00252 }
00253
00254 bool id_cluster_collection::save(const char *fn)
00255 {
00256 FILE *f = fopen(fn,"wb");
00257 if (!f) {
00258 perror(fn);
00259 return false;
00260 }
00261 bool r= save(f);
00262 fclose(f);
00263 return r;
00264 }
00265
00266 bool id_cluster_collection::save(FILE *f)
00267 {
00268 unsigned m1 = 0xFFFFFFFF;
00269 bool err=false;
00270 int n=0;
00271 for (cluster_set::iterator it=clusters.begin(); !err && it!=clusters.end(); ++it) {
00272 if (fwrite(&m1, sizeof(unsigned), 1, f) != 1) { err=true; break; }
00273 n++;
00274
00275
00276 for (id_cluster::uumap::iterator id_it((*it)->histo.begin()); id_it != (*it)->histo.end(); ++id_it)
00277 {
00278 if (fwrite(&id_it->first, sizeof(unsigned), 1, f)!=1) { err=true; break; }
00279 if (fwrite(&id_it->second, sizeof(unsigned), 1, f)!=1) { err=true; break; }
00280 }
00281 }
00282
00283 return !err;
00284 }
00285
00286 bool id_cluster_collection::save(sqlite3 *db, const char *tablename)
00287 {
00288 if (db==0) return false;
00289 int rc = sqlite3_exec(db,"begin",0,0,0);
00290 assert(rc==0);
00291
00292 const char *table = (tablename ? tablename : "clusters");
00293 char q[1024];
00294 sprintf(q, "drop table if exists %s;\n"
00295 "create table %s ("
00296 "cid_val integer primary key,"
00297 "cnt integer);\n", table, table);
00298 char *errmsg=0;
00299 rc=sqlite3_exec(db,q,0,0,&errmsg);
00300 if (rc) {
00301 cerr << table << ": can't create/clear table: rc=" << rc << endl << sqlite3_errmsg(db) << endl;
00302 cerr <<"Query: " << q << endl;
00303 if (errmsg) cerr << errmsg << endl;
00304 sqlite3_exec(db,"rollback",0,0,0);
00305 return false;
00306 }
00307
00308 sqlite3_stmt *insert;
00309 sprintf(q,"insert into %s (cid_val, cnt) values (?,?)", table);
00310 rc=sqlite3_prepare_v2(db, q, -1, &insert, 0);
00311 assert(rc == 0);
00312
00313 for (cluster_set::iterator it=clusters.begin(); it!=clusters.end(); ++it) {
00314
00315 for (id_cluster::uumap::iterator id_it((*it)->histo.begin()); id_it != (*it)->histo.end(); ++id_it)
00316 {
00317 sqlite3_int64 cid_val = (*it)->id;
00318 cid_val = (cid_val << 32) | id_it->first;
00319 sqlite3_bind_int64(insert, 1, cid_val);
00320 sqlite3_bind_int(insert, 2, id_it->second);
00321 if (sqlite3_step(insert) != SQLITE_DONE) {
00322 cerr << "error while inserting rows: " << sqlite3_errmsg(db)<< endl;
00323 sqlite3_free(errmsg);
00324 sqlite3_finalize(insert);
00325 sqlite3_exec(db,"rollback",0,0,0);
00326 return false;
00327 }
00328 sqlite3_reset(insert);
00329 }
00330 }
00331 sqlite3_exec(db,"commit",0,0,0);
00332 sqlite3_finalize(insert);
00333
00334 return true;
00335 }
00336
00337 bool id_cluster_collection::load(sqlite3 *db, const char *tablename)
00338 {
00339 if (db==0) return false;
00340 const char *table = (tablename ? tablename : "clusters");
00341 char q[1024];
00342 sprintf(q, "select cid_val,cnt from %s", table);
00343 sqlite3_stmt *select;
00344 int rc =sqlite3_prepare_v2(db, q, -1, &select, 0);
00345 if (rc) {
00346 cerr << "DB error: " << sqlite3_errmsg(db)<< endl;
00347 return false;
00348 }
00349
00350 std::vector<id_cluster *> clusters;
00351 while (sqlite3_step(select) == SQLITE_ROW) {
00352 sqlite3_int64 cid_val = sqlite3_column_int64(select, 0);
00353 unsigned cid = cid_val >> 32;
00354 unsigned val = cid_val & 0xffffffff;
00355 unsigned cnt = sqlite3_column_int(select, 1);
00356
00357 assert(cnt!=0);
00358
00359 if (cid >= clusters.size()) {
00360 clusters.insert(clusters.end(), cid+1-clusters.size(), 0);
00361 }
00362 id_cluster *c;
00363 if (clusters[cid] == 0)
00364 c = clusters[cid] = new id_cluster;
00365 else
00366 c = clusters[cid];
00367
00368 c->add(val,cnt);
00369 }
00370 sqlite3_finalize(select);
00371
00372 for (int i=1; i<clusters.size(); i++) {
00373 assert(clusters[i]);
00374 assert(clusters[i]->total>0);
00375 add_cluster(clusters[i]);
00376 }
00377
00378 is_idf_normalized = false;
00379 cmp_best_clusters();
00380 return true;
00381 }
00382
00383 bool id_cluster_collection::load(const char *fn)
00384 {
00385 FILE *f = fopen(fn,"rb");
00386 if (!f) {
00387 perror(fn);
00388 return false;
00389 }
00390 version++;
00391
00392 id_cluster *c = 0;
00393
00394 unsigned val;
00395 bool err=false;
00396 int n=0;
00397 while (fread(&val, sizeof(unsigned), 1, f)==1) {
00398 if (val==0xFFFFFFFF) {
00399 if (c) {
00400 if (c->total==0) { err=true; break; }
00401 n++;
00402 add_cluster(c);
00403 }
00404 c = new id_cluster();
00405 } else {
00406 if (!c) { err=true; break; }
00407 unsigned cnt;
00408 if (fread(&cnt, sizeof(unsigned), 1, f) != 1) { err=true; break; }
00409 if (cnt==0) {err=true; break;}
00410 c->add(val,cnt);
00411 }
00412 }
00413
00414 if (!err) {
00415 if (c->total==0) err=true;
00416 else {
00417 n++;
00418 add_cluster(c);
00419 }
00420 }
00421 if (err) {
00422 cerr << fn << ": file format error\n";
00423 fclose(f);
00424 return false;
00425 }
00426
00427
00428 fclose(f);
00429 is_idf_normalized = false;
00430 cmp_best_clusters();
00431 return true;
00432 }
00433
00434 id_cluster_collection::~id_cluster_collection()
00435 {
00436 for (cluster_set::iterator it=clusters.begin(); it!=clusters.end(); ++it)
00437 delete *it;
00438 }
00439
00440 void id_cluster_collection::cmp_best_clusters()
00441 {
00442 best_cluster.clear();
00443 for (id2cluster_map::iterator it(id2cluster.begin()); it!=id2cluster.end(); ++it)
00444 {
00445 float best_p=0;
00446 id_cluster *best_c=0;
00447
00448 for (cluster_map::iterator cit=it->second.begin(); cit!=it->second.end(); ++cit) {
00449 float p = cit->second;
00450
00451 if (p>best_p) {
00452 best_p=p;
00453 best_c=cit->first;
00454 }
00455 }
00456 best_cluster[it->first] = best_c;
00457 }
00458 }
00459
00460 unsigned id_cluster_collection::get_best_cluster(unsigned id)
00461 {
00462 map<unsigned, id_cluster *>::iterator it = best_cluster.find(id);
00463 if (it != best_cluster.end()) return it->second->id;
00464 return 0;
00465 }
00466
00467
00468 void id_cluster_collection::reduce(float threshold)
00469 {
00470
00471
00472
00473
00474
00475
00476
00477
00478 cout << "Building initial distance matrix...\n";
00479 build_distance_matrix(threshold);
00480 cout << "\ndone. ";
00481
00482 unsigned start = clusters.size();
00483 unsigned merged=0;
00484 cout << "Starting clustering...\n";
00485 do {
00486
00487 set<cluster_dist_t>::reverse_iterator closest_it = distance_matrix.rbegin();
00488
00489 if (closest_it == distance_matrix.rend()) {
00490 cout << "Empty distance matrix.\n";
00491 break;
00492 }
00493 cluster_dist_t closest = *closest_it;
00494
00495
00496 if (closest.d < threshold) {
00497 cout << "highest dotprod is " << closest.d << ", stopping clustering.\n";
00498 break;
00499 }
00500 merged++;
00501
00502
00503
00504
00505 remove_from_distance_matrix(closest.a);
00506 remove_from_distance_matrix(closest.b);
00507
00508
00509 merge_clusters(closest.a, closest.b);
00510
00511 add_to_distance_matrix(closest.a, threshold);
00512 } while(1);
00513 cout << merged << " merges. " << clusters.size() << " clusters left, over " << start << endl;
00514 version++;
00515
00516 int i=0;
00517 for (cluster_set::iterator it(clusters.begin()); it!=clusters.end(); ++it)
00518 (*it)->id = ++i;
00519 }
00520
00521 void id_cluster_collection::remove_cluster(id_cluster *c)
00522 {
00523 cluster_set::iterator it = clusters.find(c);
00524 if (it != clusters.end()) {
00525 clusters.erase(it);
00526
00527
00528 for (id_cluster::uumap::iterator i = c->histo.begin(); i!=c->histo.end(); ++i) {
00529 id2cluster_map::iterator id2c( id2cluster.find(i->first) );
00530
00531 id2c->second.erase(c);
00532 if (id2c->second.size() == 0)
00533 id2cluster.erase(id2c);
00534 }
00535 }
00536 version++;
00537 }
00538
00539 void id_cluster_collection::update_cluster(id_cluster *c, unsigned id, int amount)
00540 {
00541 if (amount==0) return;
00542
00543 int val = c->add(id, amount);
00544
00545 if ( flags & (QUERY_BIN_FREQ | QUERY_IDF)) is_idf_normalized=false;
00546 else c->weighted_sum = c->total;
00547 version++;
00548
00549 if (clusters.find(c) != clusters.end()) {
00550
00551
00552 if (val==0) {
00553
00554 id2cluster_map::iterator id2c( id2cluster.find(id) );
00555 if (id2c != id2cluster.end()) {
00556
00557 id2c->second.erase(c);
00558 }
00559 } else {
00560
00561
00562 id2cluster[id][c]=val;
00563 }
00564 }
00565 }
00566
00567 void id_cluster_collection::build_distance_matrix(float t)
00568 {
00569 distance_matrix.clear();
00570 int n=0;
00571 float tot = clusters.size();
00572 for (cluster_set::iterator i=clusters.begin(); i!=clusters.end(); ++i) {
00573 add_to_distance_matrix(*i, t);
00574 printf("% 3d%% - % 8d elements\r", (int)(100.0f* (++n)/tot), (int)distance_matrix.size());
00575 fflush(stdout);
00576 }
00577 }
00578
00579 void id_cluster_collection::remove_from_distance_matrix(id_cluster *c)
00580 {
00581 cluster_score_map similar;
00582 get_scores(c, similar);
00583
00584 for (cluster_score_map::iterator i=similar.begin(); i!=similar.end(); ++i) {
00585 cluster_dist_t cd(i->first, c, i->second);
00586 distance_matrix.erase(cd);
00587 }
00588
00589
00590
00591
00592
00593
00594
00595
00596
00597
00598
00599
00600
00601
00602
00603
00604
00605
00606
00607
00608
00609
00610
00611
00612
00613
00614
00615
00616
00617
00618
00619
00620
00621
00622
00623 }
00624
00625 void id_cluster_collection::add_to_distance_matrix(id_cluster *c, float threshold)
00626 {
00627 cluster_score_map similar;
00628 get_scores(c, similar);
00629
00630 for (cluster_score_map::iterator i=similar.begin(); i!=similar.end(); ++i) {
00631 cluster_dist_t cd(i->first, c, i->second);
00632 if (cd.d > threshold)
00633 distance_matrix.insert(cd);
00634 }
00635
00636 }
00637
00638
00639 id_cluster_collection::cluster_dist_t::cluster_dist_t(id_cluster *_a, id_cluster *_b, float _d)
00640 {
00641 assert(_a!=_b);
00642 if (_a < _b) {
00643 a = _a;
00644 b = _b;
00645 } else {
00646 a=_b;
00647 b=_a;
00648 }
00649
00650 d = _d;
00651 }
00652
00653 bool id_cluster_collection::cluster_dist_t::operator < (const cluster_dist_t &c) const
00654 {
00655 if (d == c.d) {
00656
00657 if (a < c.a) return true;
00658 else {
00659 if (a==c.a)
00660 return b < c.b;
00661 else
00662 return false;
00663 }
00664 }
00665 if (d<c.d) return true;
00666 return false;
00667 }
00668
00669 void id_cluster_collection::clear()
00670 {
00671 for (cluster_set::iterator it(clusters.begin()); it!=clusters.end(); it++)
00672 delete *it;
00673 clusters.clear();
00674 id2cluster.clear();
00675 best_cluster.clear();
00676 distance_matrix.clear();
00677 version++;
00678 }
00679
00680 float id_cluster_collection::idf(id_cluster_collection::id2cluster_map::iterator &id_it)
00681 {
00682 if (clusters.size()<2) return 1;
00683 return logf(clusters.size()/id_it->second.size());
00684 }
00685
00686 float id_cluster_collection::idf(unsigned id)
00687 {
00688 id2cluster_map::iterator id_it = id2cluster.find(id);
00689 if (id_it == id2cluster.end()) return 0;
00690 return idf(id_it);
00691 }
00692
00693 incremental_query::incremental_query(id_cluster_collection *db)
00694 : database(db)
00695 {
00696 if (db) version=db->version;
00697 }
00698
00699 void incremental_query::clear() {
00700 results.clear();
00701 query_cluster.clear();
00702 scores.clear();
00703 if (database) version=database->version;
00704 }
00705
00706
00707
00708 void incremental_query::modify(unsigned id, int amount)
00709 {
00710 if (!database) return;
00711 if (amount ==0) return;
00712
00713
00714
00715
00716 unsigned amount_to_add = query_cluster.add(id,amount);
00717 unsigned amount_to_remove = amount_to_add - amount;
00718
00719 id_cluster_collection::id2cluster_map::iterator id_it = database->id2cluster.find(id);
00720 if (id_it==database->id2cluster.end())
00721 return;
00722 float idf = 1;
00723
00724 if (database->flags & id_cluster_collection::QUERY_IDF)
00725 idf = database->idf(id_it);
00726
00727 database->set_query_rules(database->flags);
00728
00729
00730 for (id_cluster_collection::cluster_map::iterator cit=id_it->second.begin();
00731 cit!=id_it->second.end(); ++cit)
00732 {
00733 float idf_n = idf;
00734
00735 if (database->flags & id_cluster_collection::QUERY_NORMALIZED_FREQ)
00736 idf_n = idf/cit->first->weighted_sum;
00737
00738 float score_to_add;
00739 float score_to_remove;
00740
00741 float tf = cit->second;
00742 if (database->flags & id_cluster_collection::QUERY_BIN_FREQ) {
00743 amount_to_remove = min(amount_to_remove, (unsigned)1);
00744 amount_to_add = min(amount_to_add, (unsigned)1);
00745 tf = min(tf,1.0f);
00746 }
00747
00748 if (database->flags & id_cluster_collection::QUERY_MIN_FREQ) {
00749 score_to_remove = min(tf, (float)amount_to_remove) * idf_n;
00750 score_to_add = min(tf, (float)amount_to_add) * idf_n;
00751 } else {
00752 score_to_add = amount_to_add * tf * idf_n;
00753 score_to_remove = amount_to_remove * tf * idf_n;
00754 }
00755
00756 cluster_score_map::iterator __i = scores.lower_bound(cit->first);
00757
00758 float s;
00759 if (__i == scores.end() || cit->first<(*__i).first) {
00760
00761 s = score_to_add;
00762 if (s>0) scores.insert(__i, pair<id_cluster *, double>(cit->first, s));
00763
00764 } else {
00765 s = __i->second + score_to_add - score_to_remove;
00766 __i->second= s;
00767 }
00768 }
00769 }
00770
00771 incremental_query::iterator incremental_query::sort_results(unsigned max_results)
00772 {
00773 if (!database) return end();
00774 results.clear();
00775 for (cluster_score_map::iterator it(scores.begin()); it!=scores.end(); ++it)
00776 {
00777 float s = it->second;
00778 if (database->flags & id_cluster_collection::QUERY_NORMALIZED_FREQ) {
00779 if ((database->flags & (id_cluster_collection::QUERY_MIN_FREQ | id_cluster_collection::QUERY_BIN_FREQ)) ==0)
00780 s = s/(float)query_cluster.total;
00781 }
00782
00783 if (results.size() < max_results) {
00784 results.insert(ranked_cluster(it->first, s));
00785 } else if (results.rbegin()->score < s) {
00786 std::set< ranked_cluster >::iterator last(results.end());
00787 --last;
00788 results.erase(last);
00789 results.insert(ranked_cluster(it->first, s));
00790 }
00791 }
00792
00793
00794
00795
00796
00797
00798
00799 return begin();
00800 }
00801
00802 incremental_query::iterator incremental_query::sort_results_min_ratio(float ratio)
00803 {
00804 if (!database) return end();
00805 results.clear();
00806 for (cluster_score_map::iterator it(scores.begin()); it!=scores.end(); ++it)
00807 {
00808 float s = it->second;
00809 if (database->flags & id_cluster_collection::QUERY_NORMALIZED_FREQ) {
00810 if ((database->flags & (id_cluster_collection::QUERY_MIN_FREQ | id_cluster_collection::QUERY_BIN_FREQ)) ==0)
00811 s = s/(float)query_cluster.total;
00812 }
00813
00814 iterator first = begin();
00815 if (first == end()) {
00816 results.insert(ranked_cluster(it->first, s));
00817 } else if (first->score*ratio < s) {
00818 results.insert(ranked_cluster(it->first, s));
00819 }
00820 }
00821
00822 iterator first = begin();
00823 if (first != end()) {
00824 ranked_cluster limit(0,first->score * ratio);
00825 results.erase(lower_bound(begin(),end(), limit), end());
00826 }
00827 return begin();
00828 }
00829
00830 void incremental_query::set(id_cluster *q)
00831 {
00832 if (!database) return;
00833 id_cluster::uumap::iterator goal(q->histo.begin());
00834 id_cluster::uumap::iterator actual(query_cluster.histo.begin());
00835
00836
00837 if (version!= database->version) {
00838
00839 clear();
00840 for (goal = q->histo.begin(); goal != q->histo.end(); ++goal)
00841 modify(goal->first, goal->second);
00842 return;
00843 }
00844
00845 unsigned modifs=0;
00846 unsigned q_size = q->histo.size();
00847 vecmap<unsigned,int> modif_map;
00848 modif_map.reserve(q_size+ query_cluster.histo.size());
00849
00850 while (goal!=q->histo.end() && actual!=query_cluster.histo.end()) {
00851
00852 if (actual->first == goal->first) {
00853 if (goal->second != actual->second) {
00854 modif_map[actual->first] = goal->second - actual->second;
00855 }
00856 ++actual;
00857 ++goal;
00858 } else if (actual->first < goal->first) {
00859 modif_map[actual->first] = -actual->second;
00860 ++actual;
00861 } else {
00862 modif_map[goal->first] = goal->second;
00863 ++goal;
00864 }
00865 }
00866 while (goal != q->histo.end()) {
00867 modif_map[goal->first] = goal->second;
00868 ++goal;
00869 }
00870 while (actual != query_cluster.histo.end()) {
00871 modif_map[actual->first] = -actual->second;
00872 ++actual;
00873 }
00874
00875 if (modifs>= q_size) {
00876
00877 clear();
00878 for (goal = q->histo.begin(); goal != q->histo.end(); ++goal)
00879 modify(goal->first, goal->second);
00880 } else {
00881
00882 for (vecmap<unsigned,int>::iterator it(modif_map.begin()); it!=modif_map.end(); ++it)
00883 modify(it->first, it->second);
00884 }
00885 }
00886
00887 id_cluster *incremental_query::get_best(float *score)
00888 {
00889 if (!database) return 0;
00890 sort_results(5);
00891 std::set< ranked_cluster >::iterator it = results.begin();
00892 if (it!=results.end()) {
00893 if (score) *score = it->score;
00894 return it->c;
00895 }
00896 return 0;
00897 }
00898