# This work is licensed under the Creative Commons Attribution-Noncommercial-Share Alike 3.0 United States License.
# To view a copy of this license, visit http://creativecommons.org/licenses/by-nc-sa/3.0/us/ or send a letter to
# Creative Commons, 171 Second Street, Suite 300, San Francisco, California, 94105, USA.

# written by Jeff Huang
# cite: Huang, J. and Efthimiadis, E. N. 2009. Analyzing and evaluating query reformulation strategies in web search logs. In Proceeding of CIKM '09, 77-86.
# http://jeffhuang.com/cikm09_final.pdf


from __future__ import division
import sys
import os
import time

# Third Party Libraries
import numpy # http://numpy.scipy.org/
from porter import PorterStemmer # http://tartarus.org/~martin/PorterStemmer/python.txt
from nltk.wordnet import * # http://www.nltk.org/download

# The first argument to this program is the input file

# Reformulation: Spelling Correction
# Returns levenshtein edit distance
def spellingCorrection(a, b):
	table = numpy.zeros((len(a)+1, len(b)+1))
	for i in range(len(a)+1):
		table[i, 0] = i
	for j in range(len(b)+1):
		table[0, j] = j

	for i in range(1, len(a)+1):
		for j in range(1, len(b)+1):
			if a[i-1] == b[j-1]:
				cost = 0
			else:
				cost = 1
			table[i,j] = min(table[i-1,j]+1, table[i,j-1]+1, table[i-1,j-1]+cost)

	return table[len(a), len(b)]


def wordSubstitutionHelper(a, b):
	aBase = morphy(a)
	bBase = morphy(b)
	if not aBase or not bBase:
		return False

	for type in [N, V, ADJ, ADV]:
		try:
			for synset in type[aBase]:
				for word in synset:
					if word.replace('_', ' ') == bBase:
						return True
					for relation in synset.relations().items():
						#relationType = relation[0]
						for wordOrSynset in relation[1]:
							if isinstance(wordOrSynset, str):
								if wordOrSynset.replace('_', ' ') == bBase:
									return True
							else: # it's a synset
								for word in wordOrSynset:
									if word.replace('_', ' ') == bBase:
										return True

		except KeyError:
			pass
		try:
			for synset in type[bBase]:
				for word in synset:
					if word.replace('_', ' ') == aBase:
						return True
					for relation in synset.relations().items():
						#relationType = relation[0]
						for wordOrSynset in relation[1]:
							if isinstance(wordOrSynset, str):
								if wordOrSynset.replace('_', ' ') == aBase:
									return True
							else: # it's a synset
								for word in wordOrSynset:
									if word.replace('_', ' ') == aBase:
										return True
		except KeyError:
			pass

	return False

# Reformulation: Word Substitution
def wordSubstitution(a, b):
	aWords = a.split()
	bWords = b.split()

	if wordSubstitutionHelper(a, b):
		return True

	if len(aWords) != len(bWords):
		return False

	for i in range(len(aWords)):
		if aWords[i] == bWords[i]:
			continue
		if wordSubstitutionHelper(aWords[i], bWords[i]):
			continue
		return False
	return True

# Reformulation: Acronym
def formAcronym(a, b):
	if len(a.split()) <= 1 and len(b.split()) <= 1:
		return False
	if b.replace(".", "").replace("-", "") == "".join(i[0] for i in a.split()):
		return True
	return False

# Reformulation: Word Reorder
def wordReorder(a, b):
	aWords = []
	bWords = []
	for split1 in a.split():
		for split2 in split1.split('-'):
			for split3 in split2.split(','):
				aWords.append(split3)
	for split1 in b.split():
		for split2 in split1.split('-'):
			for split3 in split2.split(','):
				bWords.append(split3)
	aWords.sort()
	bWords.sort()
	return aWords == bWords

# Reformulation: Add Words, Remove Words
def addWords(a, b):
	aWords = a.split()
	bWords = b.split()
	for aWord in aWords:
		if aWord in bWords:
			continue
		return False
	return True

# Reformulation: Abbreviation
def abbreviation(a, b):
	aWords = a.split()
	bWords = b.split()
	if len(aWords) != len(bWords):
		return False

	for i in range(len(aWords)):
		if not (aWords[i].startswith(bWords[i]) or bWords[i].startswith(aWords[i])):
			return False
	return True

# Reformulation: Stemming
def stemCompare(a, b):
	p = PorterStemmer()
	aWords = a.split()
	bWords = b.split()
	if len(aWords) != len(bWords):
		return False

	for i in range(len(aWords)):
		if p.stem(aWords[i], 0, len(aWords[i])-1) != p.stem(bWords[i], 0, len(bWords[i])-1):
			return False
	return True

# Reformulation: Strip URL Components
def urlStrip(a, b):
	aWords = a.split()
	bWords = b.split()

	done = False
	while not done:
		try:
			aWords.remove('http')
		except ValueError:
			done = True

	done = False
	while not done:
		try:
			bWords.remove('http')
		except ValueError:
			done = True

	if len(aWords) != len(bWords):
		return False

	for i in range(len(aWords)):
		if aWords[i].startswith('www.'):
			aWords[i] = aWords[i][4:]
		if aWords[i].endswith('.com'):
			aWords[i] = aWords[i][:-4]
		if bWords[i].startswith('www.'):
			bWords[i] = bWords[i][4:]
		if bWords[i].endswith('.com'):
			bWords[i] = bWords[i][:-4]
		if aWords[i] != bWords[i]: # might want to do compounding here too
			return False

	return True

# Reformulation: Whitespace and Punctuation
def whitespacePunctuation(a, b):
	return a.replace(" ", "").replace(".", "").replace("-", "") == b.replace(" ", "").replace(".", "").replace("-", "")


# CODE STARTS RUNNING HERE

if __name__ == '__main__':
	prevUserId = None
	prevQuery = None
	prevURL = None
	prevRank = None
	prevTimestamp = None
	data = {}
	data['same'] = data['wordReorder'] = data['whitespacePunctuation'] = data['addWords'] = data['removeWords'] = data['urlStrip'] = data['stemCompare'] = data['wordSubstitution'] = data['formAcronym'] = data['expandAcronym']= data['abbreviation'] = data['spellingCorrection'] = data['substring'] = data['superstring'] = data['new'] = 0
	for line in open(sys.argv[1]):
		#print "LINE:", line.strip()
		columns = line.split("\t")
		try:
			userId = int(columns[0])
		except ValueError:
			continue

		query = columns[1]
		timestamp = columns[2]
		if columns[3]:
			try: # there is one bad line :(
				rank = int(columns[3])
			except ValueError:
				continue
		else:
			rank = None
		url = columns[4].strip()

		if query == "-" or prevQuery == "-" or userId != prevUserId:
			prevUserId = userId
			prevQuery = query
			prevTimestamp = timestamp
			prevRank = rank
			prevUrl = url
			continue

		if url and prevUrl:
			urlSame = (url == prevUrl)
		else:
			urlSame = None
		if rank and prevRank:
			rankChange = prevRank - rank # rankChange is how much the second query click went UP
		else:
			rankChange = None

		if rank and prevRank:
			clickPattern = "ClickClick"
		elif rank and not prevRank:
			clickPattern = "SkipClick"
		elif not rank and prevRank:
			clickPattern = "ClickSkip"
		else:
			clickPattern = "SkipSkip"

		timeDiff = time.mktime(time.strptime(timestamp, "%Y-%m-%d %H:%M:%S")) - time.mktime(time.strptime(prevTimestamp, "%Y-%m-%d %H:%M:%S"))

		if prevQuery == query:
			type = 'same'

		elif wordReorder(prevQuery, query):
			type = 'wordReorder'

		elif whitespacePunctuation(prevQuery, query):
			type = 'whitespacePunctuation'

		elif addWords(prevQuery, query):
			type = 'addWords'

		elif addWords(query, prevQuery): # this is removeWords
			type = 'removeWords'

		elif urlStrip(prevQuery, query):
			type = 'urlStrip'

		elif stemCompare(prevQuery, query):
			type = 'stemCompare'

		elif formAcronym(prevQuery, query):
			type = 'formAcronym'

		elif formAcronym(query, prevQuery): # this is expandAcronym
			type = 'expandAcronym'

		elif abbreviation(prevQuery, query): # this is a single metric because you can abbreviate terms in either query at the same time
			type = 'abbreviation'

		# Reformulation: Query Substring
		elif prevQuery.startswith(query) or prevQuery.endswith(query):
			type = 'substring'

		# Reformulation: Query Superstring
		elif query.startswith(prevQuery) or query.endswith(prevQuery):
			type = 'superstring'

		elif wordSubstitution(prevQuery, query):
			type = 'wordSubstitution'

		elif spellingCorrection(prevQuery, query) <= 2:
			type = 'spellingCorrection'

		else:
			type = 'new'

		prevQuery = query
		prevTimestamp = timestamp
		prevRank = rank
		prevUrl = url

		print type, ',', userId, ',', int(timeDiff), ',', urlSame, ',', rankChange, ',', clickPattern
