#!/usr/local/bin/python
# -*- coding: utf-8 -*-

import sys
import cgi
import MySQLdb
import codecs
import string
import re
from HTMLParser import HTMLParser

import codecs
import re
import sys
import os
import datetime
from time import gmtime, strftime


# from http://stackoverflow.com/questions/196345/how-to-check-if-a-string-in-python-is-in-ascii
def is_ascii(s):
    """Return True if every character of s has an ordinal below 128.

    An empty s yields True.
    """
    for ch in s:
        if ord(ch) >= 128:
            return False
    return True

# class for removing HTML tags
# from http://stackoverflow.com/questions/753052/strip-html-from-strings-in-python
class MLStripper(HTMLParser):
    """HTML parser that keeps only the text found between tags.

    Feed markup with feed(); get_data() returns every text chunk seen so
    far, concatenated in the order it was encountered.
    """

    def __init__(self):
        # Run the base initialiser rather than calling reset() directly, so
        # that any parser state the base class creates in __init__ exists
        # before feed() is used.  It is called explicitly (not via super())
        # because HTMLParser may be an old-style class.
        HTMLParser.__init__(self)
        self.fed = []

    def handle_data(self, d):
        """Collect one chunk of text content reported by the parser."""
        self.fed.append(d)

    def get_data(self):
        """Return all collected text joined into a single string."""
        return ''.join(self.fed)

# remove HTML tags
def strip_tags(html):
    """Return the text that an MLStripper collects when fed html."""
    stripper = MLStripper()
    stripper.feed(html)
    text_only = stripper.get_data()
    return text_only

# common words - these can be omitted or greyed-out if desired
# (kept as a frozenset so each membership test is a hash lookup)
_STOP_WORDS = frozenset(['καὶ','δὲ','ἂν','ὅτι','ὦ','ἢ','μὲν','οὐκ','τὸ','εἰ','οὐ','τῶν','ἀλλὰ','γὰρ','μὴ','τί','τοῦτο','οὖν','ἐν','ἡ','τε','ςοι','μοι','γε','περὶ','δὴ','τὰ','ὡς','τὴν','οἱ','πάνυ','τοῦ','οὐκοῦν',"ἀλλ'",'ναί','τῆς','ςὺ','τοὺς','οὐδὲν','ὃ','τὸν','τῷ',"τι'",'ὁ','ἄρα','ταῦτα','νῦν','τις','ὥςπερ',"δ'",'ἀπὸ','τοῖς','ἐγὼ','δέ','εἴτε','αὐτὸ','κατὰ','αὐτῷ','πρὸς','οὐδὲ','ἄλλο','εἰς','οὕτως','οὐχ','πῶς','αὐτὸν','ἔγωγε','μετὰ','παρὰ','ἄν','εἴ','τούτων','τῇ','τοτὲ','ὧν','ἡμῖν','δεῖ','τούτου','μήτε','ἐπειδὴ','τὰς','ἃ','ὅταν','αὐτὸς','εἰπεῖν',"οὐδ'",'τούτῳ','οἷον','εἴπερ','οὗ','μέν','που','γέ','εὖ','ἐκ','οὗτος','ἔτι','γάρ','τοίνυν','ἄλλα','πάντα','ἀεὶ','ὥςτε','τούτους','οὔτε','μᾶλλον','ἔμοιγε',"ἆρ'",'τόδε','οὕτω','ἔγωγε','ἔμοιγε.','μὴν','ἐπὶ','ἦν','αὐτῶν','διὰ','οὓς','ἵνα','ἄρτι','οἷόν','αὐτοῦ','κακὰ',"παρ'",'ᾖ','ἆρα',"τ'",'ςε','ταύτης','με','ἐμοὶ','πότερον','ἐμοῦ','ἄλλῳ','οὔ;','δή,','ἐξ','ἐγώ','αὕτη','ἄλλους','ποτε','τοῦτό','μή,',"γ'",'μὰ','τοῦτον','αὐτὰ','οὔ','οὗτοι','πάντων','αὐτῆς','αὐτὴν','νυνδὴ','ταύτῃ','ἄλλοις','ἄλλων','ἡμᾶς','ἣν','εἰπὲ','ὅδε','τίς','ἐὰν','ἔχω','μηδὲ','ςοῦ','τίνας','ἤδη',"καθ'",'ᾧ','αὑτοῦ','πρὶν', 'ἐάν', 'πρὸ', 'ςὸς', 'ὑμᾶς', 'τινὶ', 'ὑμῖν', 'τι','δή','μή','ςύ'])


# extract and process a range of sections from a search string; e.g. given 'meno 98a - 100b', retrieve sections
# 98a - 100b of Meno.
def get_range(cur, search_term, base_url):
   """Print the passages of a section range and return their word counts.

   search_term is read as "<title> <section>" or
   "<title> <first section> - <last section>", where the title consists of
   ASCII letters and spaces, optionally followed by " 1 " or " 2 ".  The
   rows returned by the query against the greek table are printed as HTML
   (a numbered heading with a link built from base_url, then the text).

   Args:
      cur: database cursor; must provide execute(query, params) and be
         iterable over the result rows (source, section, text).
      search_term: the query string as typed by the user.
      base_url: URL prefix for the per-source links.

   Returns:
      False when search_term does not look like a section reference.
      Otherwise a string of HTML <div> pairs (word, count) ordered by
      descending count, ties broken by the word itself; the string is empty
      when the query returned no rows.
   """
   m = re.search(r"\s*([a-zA-Z ]+( [12] ){0,1})\s*(\w+)\s*\-\s*(\w+)$", search_term)
   if m:
      first_section = m.group(3)
      last_section = m.group(4)
   else:
      m = re.search(r"\s*([a-zA-Z ]+( [12] ){0,1})\s*(\w+)\s*$", search_term)
      if not m:
         return False
      # a single section is a range that starts and ends at the same place
      first_section = last_section = m.group(3)

   # The title group is greedy over spaces, so it normally ends with the
   # blank that separates it from the section; strip it so the comparison
   # does not depend on how the database treats trailing spaces.
   source = m.group(1).strip()

   # Placeholders instead of string interpolation: the driver quotes and
   # escapes the user-supplied values, which prevents SQL injection.
   cmd = (
      "SELECT source1,sec2,text2 FROM ("
      "SELECT e1.source as source1, e1.section as sec1, e1.id as id1, "
      "e2.section as sec2, e2.id as id2,e2.text as text2 "
      "FROM greek e1 LEFT JOIN greek e2 "
      "ON e1.source = %s AND e2.id >= e1.id AND e2.source = e1.source "
      "WHERE e1.section = %s) AS t1 "
      "LEFT JOIN greek e3 ON id2 <= e3.id AND source1 = e3.source "
      "WHERE e3.section = %s;"
   )
   cur.execute(cmd, (source, first_section, last_section))

   count = 0
   texts = []
   for row in cur:
      count += 1
      link = '<a href="' + base_url + row[0] + '.html">' + row[0] + "</a> " + row[1] + '<br />\n'
      print("<hr />")
      print("<h4>" + str(count) + ". " + link + "</h4>")
      text = row[2]
      print(text.replace(search_term, '<span style = "color: red;">' + search_term + "</span>"))
      texts.append(text)

   # Nothing matched: there is nothing to count (the caller treats the empty
   # result as "not a range" and falls back to a plain term search).
   if not texts:
      return ""

   # remove HTML tags
   words = strip_tags("".join(texts))

   # remove sections (e.g. [90b]) and punctuation
   words = re.sub(r'\[\d+[a-z]+\]', '', words)
   words = re.sub(r"[\:\.\,\;]", "", words)

   # single pass tally instead of list.count() per distinct word
   tally = {}
   for word in words.split():
      tally[word] = tally.get(word, 0) + 1

   # most frequent first; ties ordered by word so the output is deterministic
   ranked = sorted(tally.items(), key=lambda item: (-item[1], item[0]))

   rows = []
   for word, n in ranked:
      css = " stop_word" if word in _STOP_WORDS else ""
      rows.append('<div class="word_count_row' + css + '">' + word
                  + '</div><div class="word_count' + css + '">' + str(n) + '</div>')

   return "".join(rows)

# perform database query for passages with search terms
def search_for_term(search_term, base_url, cursor=None):
   """Print every passage whose text matches search_term, highlighted.

   The query may end with "in: <source>" (case-insensitive) to restrict the
   search to one source.  The english table is searched when the whole query
   is ASCII, the greek table otherwise.  search_term is used as a regular
   expression both in the SQL REGEXP clause and for highlighting.

   Only rows in which the highlight actually changed the text are printed,
   i.e. rows where the match occurs after a '>' and before the next '<'.

   Args:
      search_term: query string, optionally followed by "in: <source>".
      base_url: URL prefix for the per-source links.
      cursor: database cursor providing execute(query, params) and row
         iteration.  Defaults to the module-level cursor ``cur``.
         Rows are read as row[1] = source, row[2] = section, row[3] = text
         (assumes that is the column order of ``select *`` - TODO confirm
         against the table definitions).

   Returns:
      None; results are written to standard output.
   """
   if cursor is None:
      # Fall back to the cursor the main script creates at module level, so
      # existing two-argument calls keep working.
      cursor = cur

   # The table choice looks at the whole query, including any "in:" clause.
   table = "english" if is_ascii(search_term) else "greek"

   # One compiled pattern for both extracting and removing the clause.  The
   # flag must be given at compile time: passing re.IGNORECASE as the fourth
   # positional argument of re.sub() sets `count`, not `flags`.
   in_clause = re.compile(r"\s*in:\s*(.*)", re.IGNORECASE)
   m = in_clause.search(search_term)
   source = m.group(1).strip() if m else ""
   search_term = in_clause.sub("", search_term)

   # nothing left to search for (an empty pattern is not a usable query)
   if not search_term:
      return

   # `table` is one of two fixed names, so it is safe to splice in; the
   # user-supplied values go through placeholders to prevent SQL injection.
   query = "select * from " + table + " where text regexp %s"
   params = [search_term]
   if source:
      query += " and source = %s"
      params.append(source)
   cursor.execute(query + ";", tuple(params))

   # Compile once instead of once per row.  The database's regex dialect is
   # not Python's, so a term it accepted may still fail to compile here; in
   # that case highlight the literal text instead of aborting the page.
   try:
      highlight = re.compile("(?i)(>[^<]*)(" + search_term + ")")
   except re.error:
      highlight = re.compile("(?i)(>[^<]*)(" + re.escape(search_term) + ")")
   replacement = '\\1<span style = "color: red;">\\2</span>'

   count = 0
   for row in cursor:
      text = row[3]
      hi = highlight.sub(replacement, text)
      if hi == text:
         continue
      # number only the rows that are shown, so the list has no gaps
      count += 1
      link = '<a href="' + base_url + row[1] + '.html">' + row[1] + "</a> " + row[2] + '<br />\n'
      print("<hr />")
      print("<h4>" + str(count) + ". " + link + "</h4>")
      print(hi)

# Main
# perform search and generate HTML results
# =============================================================================================================

#environment variables:
base_url = "[e.g. http://www...]"
search_results_html_dir = "..."
db_settings = {
   'host': "localhost",
   'user': "[username]",
   'passwd': "[password]",
   'db': "[db name]"
}

# Convert from 'Greek and Coptic' to 'Extended Greek'
# (two different Unicode sets)
# Each pair is (Greek and Coptic code point, Greek Extended code point).  The
# two members of a pair are canonically equivalent, so as literal characters
# they are indistinguishable on screen and any tool that normalizes the file
# would silently turn every replacement into a no-op; the code points are
# therefore spelled out.
# NOTE(review): the pairs follow the order of the original literal
# replacements; confirm them against the original bytes.
_TONOS_TO_OXIA = (
   (u'\u03ac', u'\u1f71'),  # small alpha
   (u'\u03ad', u'\u1f73'),  # small epsilon
   (u'\u03ae', u'\u1f75'),  # small eta
   (u'\u03af', u'\u1f77'),  # small iota
   (u'\u03cc', u'\u1f79'),  # small omicron
   (u'\u03cd', u'\u1f7b'),  # small upsilon
   (u'\u03ce', u'\u1f7d'),  # small omega
   (u'\u0386', u'\u1fbb'),  # capital alpha
   (u'\u0388', u'\u1fc9'),  # capital epsilon
   (u'\u0389', u'\u1fcb'),  # capital eta
   (u'\u0390', u'\u1fd3'),  # small iota with dialytika
   (u'\u038a', u'\u1fdb'),  # capital iota
   (u'\u03b0', u'\u1fe3'),  # small upsilon with dialytika
   (u'\u038e', u'\u1feb'),  # capital upsilon
   (u'\u0385', u'\u1fee'),  # dialytika + accent
   (u'\u0060', u'\u1fef'),  # grave accent -> varia
   (u'\u038c', u'\u1ff9'),  # capital omicron
   (u'\u038f', u'\u1ffb'),  # capital omega
   (u'\u00b4', u'\u1ffd'),  # acute accent -> oxia
)

print("Content-Type: text/html; charset=UTF-8")
print("")

# page templates; `with` closes each file as soon as it has been read
with open(search_results_html_dir + "search-results-header.txt", "r") as f:
   header = f.read()
with open(search_results_html_dir + "search-results-footer.txt", "r") as f:
   footer = f.read()

print(header)

form = cgi.FieldStorage()

# getfirst() returns the default when the field is missing (and the first
# value when it is repeated), so no exception handling is needed here.
search_term = form.getfirst("a", "")

# the form value is a UTF-8 byte string, so compare encoded forms
for plain, extended in _TONOS_TO_OXIA:
   search_term = search_term.replace(plain.encode('utf-8'), extended.encode('utf-8'))

# drop quotes, dollar signs and backslashes
search_term = re.sub(r"[\"'\$\\]", "", search_term)

word_counts = ""
if not search_term.strip():
   # Report the problem and skip the search instead of carrying on with an
   # undefined or empty term; the page is still closed properly below.
   print("No search term specified.")
else:
   print("<h3>Search Results</h3>")
   # escape &, < and > so the echoed query cannot inject markup
   print("search term(s): " + cgi.escape(search_term))

   db = MySQLdb.Connection(host = db_settings['host'],user = db_settings['user'], passwd = db_settings['passwd'],db = db_settings['db'])
   try:
      # module-level name: search_for_term() reads this cursor
      cur = db.cursor()

      # a section reference ("meno 98a - 100b") is tried first; anything
      # that yields no word counts is treated as a plain term search
      word_counts = get_range(cur,search_term,base_url)
      if (not word_counts):
         search_for_term(search_term, base_url)
   finally:
      # release the connection even if a query fails
      db.close()

print("""
</div> <!-- search_results -->

<div id = "report">
""")
if (word_counts):
   print("<h3>Word counts:</h3>")
   print(word_counts)

print(footer)

