# This work is licensed under the Creative Commons Attribution-Noncommercial-Share Alike 3.0 United States License. # To view a copy of this license, visit http://creativecommons.org/licenses/by-nc-sa/3.0/us/ or send a letter to # Creative Commons, 171 Second Street, Suite 300, San Francisco, California, 94105, USA. # written by Jeff Huang # cite: Huang, J. and Efthimiadis, E. N. 2009. Analyzing and evaluating query reformulation strategies in web search logs. In Proceeding of CIKM '09, 77-86. # http://jeffhuang.com/cikm09_final.pdf from __future__ import division import sys import os import time # Third Party Libraries import numpy # http://numpy.scipy.org/ from porter import PorterStemmer # http://tartarus.org/~martin/PorterStemmer/python.txt from nltk.wordnet import * # http://www.nltk.org/download # The first argument to this program is the input file # Reformulation: Spelling Correction # Returns levenshtein edit distance def spellingCorrection(a, b): table = numpy.zeros((len(a)+1, len(b)+1)) for i in range(len(a)+1): table[i, 0] = i for j in range(len(b)+1): table[0, j] = j for i in range(1, len(a)+1): for j in range(1, len(b)+1): if a[i-1] == b[j-1]: cost = 0 else: cost = 1 table[i,j] = min(table[i-1,j]+1, table[i,j-1]+1, table[i-1,j-1]+cost) return table[len(a), len(b)] def wordSubstitutionHelper(a, b): aBase = morphy(a) bBase = morphy(b) if not aBase or not bBase: return False for type in [N, V, ADJ, ADV]: try: for synset in type[aBase]: for word in synset: if word.replace('_', ' ') == bBase: return True for relation in synset.relations().items(): #relationType = relation[0] for wordOrSynset in relation[1]: if isinstance(wordOrSynset, str): if wordOrSynset.replace('_', ' ') == bBase: return True else: # it's a synset for word in wordOrSynset: if word.replace('_', ' ') == bBase: return True except KeyError: pass try: for synset in type[bBase]: for word in synset: if word.replace('_', ' ') == aBase: return True for relation in synset.relations().items(): #relationType = relation[0] for wordOrSynset in relation[1]: if isinstance(wordOrSynset, str): if wordOrSynset.replace('_', ' ') == aBase: return True else: # it's a synset for word in wordOrSynset: if word.replace('_', ' ') == aBase: return True except KeyError: pass return False # Reformulation: Word Substitution def wordSubstitution(a, b): aWords = a.split() bWords = b.split() if wordSubstitutionHelper(a, b): return True if len(aWords) != len(bWords): return False for i in range(len(aWords)): if aWords[i] == bWords[i]: continue if wordSubstitutionHelper(aWords[i], bWords[i]): continue return False return True # Reformulation: Acronym def formAcronym(a, b): if len(a.split()) <= 1 and len(b.split()) <= 1: return False if b.replace(".", "").replace("-", "") == "".join(i[0] for i in a.split()): return True return False # Reformulation: Word Reorder def wordReorder(a, b): aWords = [] bWords = [] for split1 in a.split(): for split2 in split1.split('-'): for split3 in split2.split(','): aWords.append(split3) for split1 in b.split(): for split2 in split1.split('-'): for split3 in split2.split(','): bWords.append(split3) aWords.sort() bWords.sort() return aWords == bWords # Reformulation: Add Words, Remove Words def addWords(a, b): aWords = a.split() bWords = b.split() for aWord in aWords: if aWord in bWords: continue return False return True # Reformulation: Abbreviation def abbreviation(a, b): aWords = a.split() bWords = b.split() if len(aWords) != len(bWords): return False for i in range(len(aWords)): if not (aWords[i].startswith(bWords[i]) or bWords[i].startswith(aWords[i])): return False return True # Reformulation: Stemming def stemCompare(a, b): p = PorterStemmer() aWords = a.split() bWords = b.split() if len(aWords) != len(bWords): return False for i in range(len(aWords)): if p.stem(aWords[i], 0, len(aWords[i])-1) != p.stem(bWords[i], 0, len(bWords[i])-1): return False return True # Reformulation: Strip URL Components def urlStrip(a, b): aWords = a.split() bWords = b.split() done = False while not done: try: aWords.remove('http') except ValueError: done = True done = False while not done: try: bWords.remove('http') except ValueError: done = True if len(aWords) != len(bWords): return False for i in range(len(aWords)): if aWords[i].startswith('www.'): aWords[i] = aWords[i][4:] if aWords[i].endswith('.com'): aWords[i] = aWords[i][:-4] if bWords[i].startswith('www.'): bWords[i] = bWords[i][4:] if bWords[i].endswith('.com'): bWords[i] = bWords[i][:-4] if aWords[i] != bWords[i]: # might want to do compounding here too return False return True # Reformulation: Whitespace and Punctuation def whitespacePunctuation(a, b): return a.replace(" ", "").replace(".", "").replace("-", "") == b.replace(" ", "").replace(".", "").replace("-", "") # CODE STARTS RUNNING HERE if __name__ == '__main__': prevUserId = None prevQuery = None prevURL = None prevRank = None prevTimestamp = None data = {} data['same'] = data['wordReorder'] = data['whitespacePunctuation'] = data['addWords'] = data['removeWords'] = data['urlStrip'] = data['stemCompare'] = data['wordSubstitution'] = data['formAcronym'] = data['expandAcronym']= data['abbreviation'] = data['spellingCorrection'] = data['substring'] = data['superstring'] = data['new'] = 0 for line in open(sys.argv[1]): #print "LINE:", line.strip() columns = line.split("\t") try: userId = int(columns[0]) except ValueError: continue query = columns[1] timestamp = columns[2] if columns[3]: try: # there is one bad line :( rank = int(columns[3]) except ValueError: continue else: rank = None url = columns[4].strip() if query == "-" or prevQuery == "-" or userId != prevUserId: prevUserId = userId prevQuery = query prevTimestamp = timestamp prevRank = rank prevUrl = url continue if url and prevUrl: urlSame = (url == prevUrl) else: urlSame = None if rank and prevRank: rankChange = prevRank - rank # rankChange is how much the second query click went UP else: rankChange = None if rank and prevRank: clickPattern = "ClickClick" elif rank and not prevRank: clickPattern = "SkipClick" elif not rank and prevRank: clickPattern = "ClickSkip" else: clickPattern = "SkipSkip" timeDiff = time.mktime(time.strptime(timestamp, "%Y-%m-%d %H:%M:%S")) - time.mktime(time.strptime(prevTimestamp, "%Y-%m-%d %H:%M:%S")) if prevQuery == query: type = 'same' elif wordReorder(prevQuery, query): type = 'wordReorder' elif whitespacePunctuation(prevQuery, query): type = 'whitespacePunctuation' elif addWords(prevQuery, query): type = 'addWords' elif addWords(query, prevQuery): # this is removeWords type = 'removeWords' elif urlStrip(prevQuery, query): type = 'urlStrip' elif stemCompare(prevQuery, query): type = 'stemCompare' elif formAcronym(prevQuery, query): type = 'formAcronym' elif formAcronym(query, prevQuery): # this is expandAcronym type = 'expandAcronym' elif abbreviation(prevQuery, query): # this is a single metric because you can abbreviate terms in either query at the same time type = 'abbreviation' # Reformulation: Query Substring elif prevQuery.startswith(query) or prevQuery.endswith(query): type = 'substring' # Reformulation: Query Superstring elif query.startswith(prevQuery) or query.endswith(prevQuery): type = 'superstring' elif wordSubstitution(prevQuery, query): type = 'wordSubstitution' elif spellingCorrection(prevQuery, query) <= 2: type = 'spellingCorrection' else: type = 'new' prevQuery = query prevTimestamp = timestamp prevRank = rank prevUrl = url print type, ',', userId, ',', int(timeDiff), ',', urlSame, ',', rankChange, ',', clickPattern