
Source Code for Module cornetto.simcornet

# -*- coding: utf-8 -*-

# Copyright (C) 2008 by
# Erwin Marsi and Tilburg University


# This file is part of the Pycornetto package.

# Pycornetto is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.

# Pycornetto is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

"""
an extension of the Cornet class which adds word similarity measures
"""

# TODO:
# - other similarity measures based on path length
# - unit tests


__author__ = 'Erwin Marsi <e.c.marsi@uvt.nl>'
__revision__ = '$Id: simcornet.py 1149 2009-11-16 12:18:15Z emarsi $'
__version__ = '0.5'

from math import log

from cornetto.parse import parse_cdb_with_counts
from cornetto.cornet import Cornet

class SimCornet(Cornet):
    """
    An extension of the Cornet class which adds word similarity measures.
    This assumes counts are added to the Cornetto database.
    """

    # ------------------------------------------------------------------------------
    # Public methods
    # ------------------------------------------------------------------------------
    def open(self, cdb_lu, cdb_syn, verbose=False):
        """
        Open and parse Cornetto database files with counts

        @param cdb_lu: xml definition of the lexical units with counts
        @type cdb_lu: file or filename
        @param cdb_syn: xml definition of the synsets
        @type cdb_syn: file or filename
        @keyword verbose: verbose output during parsing
        """
        ( self._form2lu,
          self._c_lu_id2lu,
          self._c_sy_id2synset,
          self._graph,
          self._cat2counts ) = parse_cdb_with_counts(cdb_lu, cdb_syn, verbose)


    # counts

    def get_count(self, lu_spec, subcount=False, format=None):
        """
        Get (sub)counts for lexical units satisfying this specification

        >>> pprint(c.get_count("varen"))
        {'varen:noun:1': 434,
         'varen:verb:1': 15803,
         'varen:verb:2': 15803,
         'varen:verb:3': 15803}

        >>> pprint(c.get_count("varen", subcount=True))
        {'varen:noun:1': 434,
         'varen:verb:1': 18977,
         'varen:verb:2': 62086,
         'varen:verb:3': 15803}

        @param lu_spec: lexical unit specification
        @type lu_spec: string

        @keyword subcount: return subcounts instead of plain counts
        @type subcount: bool

        @keyword format: output format
        @type format: 'spec', 'xml', 'raw'

        @return: mapping of lexical units in requested output format
                 to (sub)counts
        @rtype: dict

        @note: As the counts are currently based on lemma plus part-of-speech,
               not on the sense, they are the same for all senses of the
               same category.
        """
        formatter = self._get_lex_unit_formatter(format)
        lu2count = dict()

        for lu in self.get_lex_units(lu_spec, "raw"):
            lu2count[formatter(lu)] = self._get_lu_count(lu, subcount)

        return lu2count


    def get_total_counts(self):
        """
        Get the total counts per category and overall

        The categories are "noun", "verb", "adj", "other";
        "all" represents the overall count.

        >>> c.get_total_counts()
        {'adj': 62156445,
         'all': 518291832,
         'noun': 187143322,
         'other': 199269966,
         'verb': 69722099}

        @return: mapping of categories to counts
        @rtype: dict
        """
        return self._cat2counts


    # statistics

    def get_probability(self, lu_spec, subcount=False, smooth=False,
                        cat_totals=False, format=None):
        """
        Get probability (p) for lexical units satisfying this specification,
        where the probability is defined as lu_count / total_count.

        By default, the total count is taken to be the sum of counts over all
        word forms in the Cornetto database. However, when comparing two words
        of the same category (nouns, verbs, adjectives) it may be more
        appropriate to take the sum over only the word forms of this category.
        These category totals are used if the keyword "cat_totals" is true.

        >>> pprint(c.get_probability("varen"))
        {'varen:noun:1': 8.3736608066013281e-07,
         'varen:verb:1': 3.0490544176663777e-05,
         'varen:verb:2': 3.0490544176663777e-05,
         'varen:verb:3': 3.0490544176663777e-05}

        @param lu_spec: lexical unit specification
        @type lu_spec: string

        @keyword subcount: use subcounts instead of plain counts
        @type subcount: bool

        @keyword smooth: smooth counts by adding one to lexical units
                         with a zero count
        @type smooth: bool

        @keyword cat_totals: use total count for category of lexical unit
                             instead of overall total count
        @type cat_totals: bool

        @keyword format: output format
        @type format: 'spec', 'xml', 'raw'

        @return: mapping of lexical units in requested output format
                 to probabilities
        @rtype: dict
        """
        formatter = self._get_lex_unit_formatter(format)
        lu2prob = {}

        for lu in self.get_lex_units(lu_spec, format="raw"):
            lu2prob[formatter(lu)] = self._p(lu, subcount, smooth, cat_totals)

        return lu2prob


    def get_info_content(self, lu_spec, subcount=False, smooth=False,
                         cat_totals=False, format=None):
        """
        Get information content (IC) for lexical units satisfying this
        specification, defined as the negative log of the lexical unit's
        probability, i.e. -log_2(lu_count / total_count)

        If a lexical unit has a count of zero, the probability is zero, the
        log is undefined, and None is returned, unless the keyword "smooth"
        is true, in which case the count is smoothed by adding one.

        If no lexical unit matches the specification, an empty mapping is
        returned.

        >>> pprint(c.get_info_content("plant"))
        {'plant:noun:1': 14.51769181264614}

        >>> pprint(c.get_info_content("plant", subcount=True))
        {'plant:noun:1': 10.482770362490861}

        @param lu_spec: lexical unit specification
        @type lu_spec: string

        @keyword subcount: use subcounts instead of plain counts
        @type subcount: bool

        @keyword smooth: smooth counts by adding one to lexical units
                         with a zero count
        @type smooth: bool

        @keyword cat_totals: use total count for category of lexical unit
                             instead of overall total count
        @type cat_totals: bool

        @keyword format: output format
        @type format: 'spec', 'xml', 'raw'

        @return: mapping of lexical units in requested output format
                 to information content
        @rtype: dict
        """
        formatter = self._get_lex_unit_formatter(format)
        lu2ic = {}

        for lu in self.get_lex_units(lu_spec, format="raw"):
            lu2ic[formatter(lu)] = self._IC(lu, subcount, smooth, cat_totals)

        return lu2ic


    # corpus-based similarity metrics

    def resnik_sim(self, lu_spec1, lu_spec2, smooth=False, cat_totals=False,
                   format=None):
        """
        Compute the semantic similarity as described in Philip Resnik's paper
        "Using Information Content to Evaluate Semantic Similarity in a
        Taxonomy" (1995). It is defined for a pair of concepts c1 and c2 as:

            max [IC(lcs)] for all lcs in LCS(c1, c2)

        In other words, the maximum value of the information content over all
        least common subsumers of two concepts. An important point is that
        the count of an LCS, as used in computing its probability, is the sum
        of its own count plus the counts of all concepts that it subsumes.

        As suggested by Resnik, it can be extended to _word_ similarity by
        taking the maximum over the scores for all concepts that are senses of
        the word. This means that if just two words are specified - without a
        category or sense - two sets of matching lexical units are retrieved.
        For every combination of lexical units from these two sets, the LCS is
        computed (if any), and the one with the maximum information content is
        selected.

        If no LCS is found, this can mean two things:
            1. The two words have no LCS because they truly have nothing
               in common. In this case we assume the IC of the LCS is zero
               and therefore we return zero.
            2. The two words should have something in common, but the correct
               LCS is not present in the Cornetto database. However, since
               there is no way to know this, we consider this the same as (1),
               and zero is returned.

        There are two more marginal cases:
            1. No lexical units in the Cornetto database match the
               specifications.
            2. All LCS have a subcount of zero, and no smoothing was applied,
               so their IC is undefined.
        In both cases None is returned.

        Notice that it is difficult to compare Resnik's word similarities,
        because they depend on the subcounts. With identical words, for instance,
        resnik_sim("iets", "iets") = 1.3113543459343666 whereas
        resnik_sim("spotje", "spotje") = 25.141834494846584

        @param lu_spec1: first lexical unit(s) specification
        @param lu_spec2: second lexical unit(s) specification

        @keyword smooth: smooth by adding one to lexical units with a zero count
        @type smooth: bool

        @keyword cat_totals: use total count for category of lexical unit
                             instead of overall total count
        @type cat_totals: bool

        @keyword format: output format
        @type format: 'spec', 'xml', 'raw'

        @return: similarity score greater than or equal to zero
        @rtype: float or None
        """
        max_sim = None

        # If no matching lex units were found, we return None
        if self.get_lex_units(lu_spec1) and self.get_lex_units(lu_spec2):
            lcs_ics = [ self._IC(lcs, True, smooth, cat_totals)
                        for lcs in self.least_common_subsumers(lu_spec1,
                                                               lu_spec2,
                                                               format="raw") ]
            if lcs_ics:
                # If all lcs have a subcount of zero (or no "subcount"
                # attrib), then lcs_ics is [None, ..., None], and we return
                # None, because max([None, ..., None]) == None
                max_sim = max(lcs_ics)
            else:
                # Matching lex units do exist but have no lcs, which is
                # comparable to sharing only "the ultimate abstract", which
                # has an IC of zero, and therefore similarity is zero. In
                # reality, of course, the true lcs may simply be missing from
                # the db.
                max_sim = 0.0

        return max_sim


    def jiang_conrath_dist(self, lu_spec1, lu_spec2, smooth=False,
                           cat_totals=False, format=None):
        """
        Compute the semantic distance as described in Jay Jiang & David
        Conrath's paper "Semantic Similarity Based on Corpus Statistics and
        Lexical Taxonomy" (1997). It is defined for a pair of concepts c1 and
        c2 as:

            min [IC(c1) + IC(c2) - 2 * IC(lcs)] for all lcs in LCS(c1, c2)

        This is without the edge and node weighting scheme, which is not
        implemented here. The measure is extended to a _word_ distance measure
        by taking the minimum over the scores for all concepts (lexical units)
        that are senses of the word (cf. doc of resnik_sim).

        If no LCS is found, this can mean two things:
            1. The two words have no LCS because they truly have nothing
               in common. In this case we assume the IC of the LCS is zero
               and therefore we return the minimum of IC(c1) + IC(c2).
            2. The two words should have something in common, but the correct
               LCS is not present in the Cornetto database. However, since
               there is no way to know this, we consider this the same as (1),
               and we return the minimum of IC(c1) + IC(c2).

        There are three more marginal cases:
            1. No lexical units in the Cornetto database match the
               specifications.
            2. All matching lexical units have a subcount of zero,
               and no smoothing was applied, so their IC is undefined.
            3. All LCS have a subcount of zero, and no smoothing was applied,
               so their IC is undefined. This implies that all subsumed
               lexical units must have a subcount of zero, and therefore
               (2) must be the case as well.
        In all of these three cases None is returned.

        @param lu_spec1: first lexical unit(s) specification
        @param lu_spec2: second lexical unit(s) specification

        @keyword smooth: smooth by adding one to lexical units with a zero count
        @type smooth: bool

        @keyword cat_totals: use total count for category of lexical unit
                             instead of overall total count
        @type cat_totals: bool

        @keyword format: output format
        @type format: 'spec', 'xml', 'raw'

        @return: distance greater than or equal to zero
        @rtype: float or None
        """
        # It is tempting to simply call self.least_common_subsumers(lu_spec1,
        # lu_spec2), but that would return _all_ lcs. However, since lu_spec1
        # and lu_spec2 may be underspecified, they may both refer to sets of
        # lexical units. Not every returned lcs is necessarily also a true
        # lcs for all the combinations of these lexical units. For these
        # cases, the calculation of ic1 + ic2 - 2 * lcs_ic would be invalid.

        lus1 = self.get_lex_units(lu_spec1, format="raw")
        lus2 = self.get_lex_units(lu_spec2, format="raw")

        min_dist = None

        # TODO: can be optimized in several ways, e.g.
        # transitive_closure and IC's for lu2 are calculated multiple times
        for lu1 in lus1:
            ic1 = self._IC(lu1, True, smooth, cat_totals)
            if ic1 is None:
                # zero count or no "subcount" attrib found
                continue

            for lu2 in lus2:
                ic2 = self._IC(lu2, True, smooth, cat_totals)
                if ic2 is None:
                    # zero count or no "subcount" attrib found
                    continue

                # hmmm, conversion back to spec stinks
                for lcs in self.least_common_subsumers(self._lu_to_spec(lu1),
                                                       self._lu_to_spec(lu2),
                                                       format="raw"):
                    lcs_ic = self._IC(lcs, True, smooth, cat_totals)
                    if lcs_ic is None:
                        # No "subcount" attrib found - this should never
                        # happen. Note that the subcount cannot be zero, as in
                        # that case the subcounts of all subsumed lexical
                        # units, including lu1 and lu2, must be zero as well.
                        continue

                    new_dist = ic1 + ic2 - 2 * lcs_ic

                    # cannot use "not min_dist" here, because min_dist may be
                    # zero! cannot use min() here, because min(0, None) == None
                    if min_dist is None or new_dist < min_dist:
                        min_dist = new_dist

                # At this point we are sure that we have valid ic1 and ic2.
                # If no lcs was found, we assume lcs_ic is zero,
                # and thus the sim score is ic1 + ic2.
                new_dist = ic1 + ic2

                if min_dist is None or new_dist < min_dist:
                    min_dist = new_dist

        return min_dist
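
    # A quick sanity check on the formula: for a single concept c compared
    # with itself, the LCS is c, so the distance collapses to
    #
    #     IC(c) + IC(c) - 2 * IC(c) = 0
    #
    # provided c has a nonzero subcount (otherwise its IC, and hence the
    # distance, is None).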


    def jiang_conrath_sim(self, lu_spec1, lu_spec2, smooth=False,
                          cat_totals=False, format=None):
        """
        Returns Jiang & Conrath's distance converted to a similarity
        by means of sim = 1 / (1 + dist). See jiang_conrath_dist.

        If the distance is None, so is the similarity.

        The translation from distance to similarity is not uniform. That is,
        the space between the distances 1 and 2 and between the distances 2
        and 3 is the same (i.e. 1), but the space between the corresponding
        similarities, i.e. between 0.5 and 0.33 and between 0.33 and 0.25, is
        not.

        @param lu_spec1: first lexical unit(s) specification
        @param lu_spec2: second lexical unit(s) specification

        @keyword smooth: smooth by adding one to lexical units with a zero count
        @type smooth: bool

        @keyword cat_totals: use total count for category of lexical unit
                             instead of overall total count
        @type cat_totals: bool

        @keyword format: output format
        @type format: 'spec', 'xml', 'raw'

        @return: similarity score between zero and one included
        @rtype: float or None
        """
        dist = self.jiang_conrath_dist(lu_spec1, lu_spec2, smooth, cat_totals,
                                       format)
        # dist is None if no matching lexical units were found,
        # or if all of them have a (sub)count of zero
        if dist is not None:
            return 1 / (1 + dist)


    def lin_sim(self, lu_spec1, lu_spec2, smooth=False, cat_totals=False,
                format=None):
        """
        Compute the semantic similarity as described in Dekang Lin's paper
        "An information-theoretic definition of similarity" (1998).
        It is defined for a pair of concepts c1 and c2 as:

            max [2 * IC(lcs) / (IC(c1) + IC(c2))] for all lcs in LCS(c1, c2)

        This measure is extended to a _word_ similarity measure by taking the
        maximum over the scores for all concepts (lexical units) that are
        senses of the word (cf. doc of resnik_sim).

        If no LCS is found, this can mean two things:
            1. The two words have no LCS because they truly have nothing
               in common. In this case we assume the IC of the LCS is zero and
               we return zero.
            2. The two words should have something in common, but the correct
               LCS is not present in the Cornetto database. However, since
               there is no way to know this, we consider this the same as (1),
               and we return zero.

        There are three more marginal cases:
            1. No lexical units in the Cornetto database match the
               specifications.
            2. All matching lexical units have a subcount of zero,
               and no smoothing was applied, so their IC is undefined.
            3. All LCS have a subcount of zero, and no smoothing was applied,
               so their IC is undefined. This implies that all subsumed
               lexical units must have a subcount of zero, and therefore
               (2) must be the case as well.
        In all of these three cases None is returned.


        @param lu_spec1: first lexical unit(s) specification
        @param lu_spec2: second lexical unit(s) specification

        @keyword smooth: smooth by adding one to lexical units with a zero count
        @type smooth: bool

        @keyword cat_totals: use total count for category of lexical unit
                             instead of overall total count
        @type cat_totals: bool

        @keyword format: output format
        @type format: 'spec', 'xml', 'raw'

        @return: similarity score between zero and one included
        @rtype: float or None
        """
        # this is almost identical to jiang_conrath_dist
        lus1 = self.get_lex_units(lu_spec1, format="raw")
        lus2 = self.get_lex_units(lu_spec2, format="raw")

        max_sim = None

        for lu1 in lus1:
            ic1 = self._IC(lu1, True, smooth, cat_totals)
            if ic1 is None:
                # zero count or no "subcount" attrib found
                continue

            for lu2 in lus2:
                ic2 = self._IC(lu2, True, smooth, cat_totals)
                if ic2 is None:
                    # zero count or no "subcount" attrib found
                    continue

                for lcs in self.least_common_subsumers(self._lu_to_spec(lu1),
                                                       self._lu_to_spec(lu2),
                                                       format="raw"):
                    lcs_ic = self._IC(lcs, True, smooth, cat_totals)
                    if lcs_ic is None:
                        # No "subcount" attrib found - this should never
                        # happen. Note that the subcount cannot be zero, as in
                        # that case the subcounts of all subsumed lexical
                        # units, including lu1 and lu2, must be zero as well.
                        continue

                    new_sim = (2 * lcs_ic) / float(ic1 + ic2)
                    max_sim = max(max_sim, new_sim)

                # At this point we are sure that we have valid ic1 and ic2.
                # If no lcs was found, we assume lcs_ic is zero,
                # and thus the sim score is zero.
                # Hence, without any max_sim yet, we assume max_sim is zero.
                max_sim = max(max_sim, 0.0)

        return max_sim


    # ------------------------------------------------------------------------------
    # Semi-private methods
    # ------------------------------------------------------------------------------

    def _get_lu_count(self, lu, subcount=False):
        """
        get (sub)count of lexical unit
        """
        try:
            if subcount:
                return int(lu.find("form").get("subcount"))
            else:
                return int(lu.find("form").get("count"))
        except AttributeError:
            # no <form> elem
            return None
        except TypeError:
            # no "(sub)count" attrib, so int(None) fails
            return None


    def _p(self, lu, subcount=False, smooth=False, cat_totals=False):
        """
        probability on the basis of MLE using (sub)counts
        """
        lu_count = self._get_lu_count(lu, subcount)

        if lu_count is None:
            return None

        # probability with optional smoothing by add-one
        if lu_count == 0 and smooth:
            lu_count = 1

        # optionally use the category-specific totals
        if cat_totals:
            # silently back off to "all" if no cat was found
            cat = self._get_lu_cat(lu) or "all"
        else:
            cat = "all"

        count_total = float(self._cat2counts[cat])

        return lu_count / count_total


    def _IC(self, lu, subcount=False, smooth=False, cat_totals=False, base=2):
        """
        Information Content
        """
        try:
            return -log(self._p(lu, subcount, smooth, cat_totals), base)
        except OverflowError:
            # zero probability: log(0) not defined
            return None
        except TypeError:
            # _p returned None because no count was found
            return None



# Debugging code

# Parsing cdb takes a long time. Therefore debugging is much faster if we
# parse just once, like this:
#
# >>> import cornetto.simcornet as cornet; cornet._parse()
#
# While debugging we inject the tables and graph in a Cornet instance:
#
# >>> reload(cornet); c = cornet._get_cornet_instance()
#

#def _parse(cdb_lu="cdb_lu_minimal_subcounts.xml", cdb_syn="cdb_syn_minimal.xml"):
    #global form2lu, c_lu_id2lu, c_sy_id2synset, graph, cat2counts
    #form2lu, c_lu_id2lu, c_sy_id2synset, graph, cat2counts = \
        #parse_cdb_with_counts(cdb_lu, cdb_syn, verbose=False)


#def _get_cornet_instance():
    #c = SimCornet()
    #c._form2lu = form2lu
    #c._c_lu_id2lu = c_lu_id2lu
    #c._c_sy_id2synset = c_sy_id2synset
    #c._graph = graph
    #c._cat2counts = cat2counts
    #return c