# # By Gilbert Le Blanc # http://www.python-cgi-programming.com/ # webmaster@python-cgi-programming.com # # Released under GNU Public License # # Copyright (C) # This program is free software; you can redistribute it and/or modify it under the terms of the # GNU General Public License as published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without # even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # http://www.opensource.org/licenses/gpl-license.html # This General Public License does not permit incorporating your program into proprietary programs. # # September 15, 2007 - Version 1.03 # Add commas to the large reference counts. # Added more new search engine URL's # # November 20, 2005 - Version 1.02 # Escape (\) any single or double quotes in the Referrer Title so that the document.write # Javascript functions correctly. # Added new search engine URL's. 
#
# October 10, 2005 - Version 1.01
# Check to make sure the document.location is in the form http://www.name.com/ or http://name.com/
#
# October 8, 2005 - Version 1.00
#

import os
import string
import sys
import time
import urllib

from SQLFunctions import *
# Known search-engine referrers, checked in order; the first match wins, so a
# more specific entry must come before the general one it overlaps with
# (images.google before google, mysearch.myway.com before myway.com).
# Each entry is (domain labels looked for in the host, required path prefix,
# canonical URL stored for that engine).  Entries without a top-level domain
# ('google', 'search.yahoo') also match the country sites.
_SEARCH_ENGINE_URLS = (
    ('images.google', '', 'http://images.google.com/'),
    ('google', '', 'http://www.google.com/'),
    ('images.search.yahoo', '', 'http://images.search.yahoo.com/'),
    ('search.yahoo', '', 'http://search.yahoo.com/'),
    ('altavista.com', '/image', 'http://www.altavista.com/image/'),
    ('altavista.com', '', 'http://www.altavista.com/'),
    ('pictures.ask.com', '', 'http://pictures.ask.com/'),
    ('ask.com', '', 'http://www.ask.com/'),
    ('aol.com', '', 'http://search.aol.com/'),
    ('search.msn.com', '', 'http://search.msn.com/'),
    ('answers.com', '', 'http://www.answers.com/'),
    ('excite.com', '', 'http://www.excite.com/'),
    ('dogpile.com', '', 'http://www.dogpile.com/'),
    ('use.com', '', 'http://www.use.com/'),
    ('a9.com', '', 'http://www.a9.com/'),
    ('alltheweb.com', '', 'http://www.alltheweb.com/'),
    ('att.net', '', 'http://www.att.net/'),
    ('avantfind.com', '', 'http://www.avantfind.com/'),
    ('chacha.com', '', 'http://search.chacha.com/'),
    ('crawler.com', '', 'http://www.crawler.com/'),
    ('comcast.net', '', 'http://www.comcast.net/'),
    ('earthlink.net', '', 'http://search.earthlink.net/'),
    ('foxnews.com', '', 'http://search.foxnews.com/'),
    ('infospace.com', '', 'http://www.infospace.com/home/search/'),
    ('live.com', '', 'http://www.live.com/'),
    ('mamma.com', '', 'http://www.mamma.com/'),
    ('metacrawler.com', '', 'http://www.metacrawler.com/'),
    ('mysearch.com', '', 'http://www.mysearch.com/'),
    ('mysearch.myway.com', '', 'http://www.mysearch.com/'),
    ('myway.com', '', 'http://search.myway.com/'),
    ('mywebsearch.com', '', 'http://www.mywebsearch.com/'),
    # The original pattern was misspelled 'oveture.com' and could never match.
    ('overture.com', '', 'http://www.content.overture.com/'),
    ('oxysearch.com', '', 'http://search.oxysearch.com/'),
    ('redzip.com', '', 'http://www.redzip.com/'),
    ('rr.com', '', 'http://www.rr.com/'),
    ('search.com', '', 'http://www.search.com/'),
    ('searchalot.com', '', 'http://www.searchalot.com/'),
    ('searchxyz.com', '', 'http://www.searchxyz.com/'),
    ('stumbleupon.com', '', 'http://www.stumbleupon.com/'),
    ('web.info.com', '', 'http://www.infospace.com/home/search/'),
    ('webcrawler.com', '', 'http://www.webcrawler.com/'),
    ('websearch.com', '', 'http://www.websearch.com/'),
    ('windstream.net', '', 'http://www.windstream.net/'),
)


def _host_bounds(url):
    """Return (start, end) so that url[start:end] is the host[:port] part."""
    start = 0
    marker = url.find('//')
    if marker >= 0:
        prefix = url[:marker]
        # '//' only separates the scheme from the host when it comes before
        # any path, query or fragment; otherwise it belongs to the path.
        if not ('/' in prefix or '?' in prefix or '#' in prefix):
            start = marker + 2
    # The host runs up to the first path, query or fragment delimiter, or to
    # the end of the string when there is none.
    end = len(url)
    for delimiter in '/?#':
        found = url.find(delimiter, start)
        if 0 <= found < end:
            end = found
    return start, end


def _canonical_search_engine(host, path):
    """Return the canonical URL of a known search engine, or None."""
    hostname = host.split(':')[0].lower()
    # Surrounding dots make the test match whole domain labels only, so
    # 'ask.com' matches 'www.ask.com' but not 'task.com'.
    dotted = '.' + hostname + '.'
    for domain, path_prefix, canonical in _SEARCH_ENGINE_URLS:
        if ('.' + domain + '.') in dotted and path.startswith(path_prefix):
            return canonical
    return None


def AddCommas(number):
    """Return number as a string with commas between groups of three digits.

    Works for any number of digits, keeps a leading sign and leaves a
    decimal fraction untouched.  Text whose integer part is not made of
    digits only (for example '1e+20') is returned unchanged.
    """
    number_text = str(number)
    sign = ''
    digits = number_text
    if digits[:1] in ('-', '+'):
        sign = digits[0]
        digits = digits[1:]
    fraction = ''
    dot = digits.find('.')
    if dot >= 0:
        fraction = digits[dot:]
        digits = digits[:dot]
    if not digits.isdigit():
        return number_text
    # Peel off groups of three digits from the right.
    groups = []
    while len(digits) > 3:
        groups.insert(0, digits[-3:])
        digits = digits[:-3]
    groups.insert(0, digits)
    return sign + ','.join(groups) + fraction


def check_digits(URL_in):
    """Report whether the host of the referrer URL looks like an IP address.

    Returns "T" when the host (with optional port) consists only of digits,
    '.' and ':' characters, otherwise "F".  An empty host also yields "T",
    as it always has.
    """
    start, end = _host_bounds(URL_in)
    for char in URL_in[start:end]:
        if char not in '.:' and not char.isdigit():
            return "F"
    return "T"


def check_local(URL_in, URL_out):
    """Return "local" when both URLs are on the same domain, else "referrer".

    The domains are compared without a leading 'www.' and without regard to
    case, because host names are case-insensitive.
    """
    if find_filename(URL_in).lower() == find_filename(URL_out).lower():
        return "local"
    return "referrer"


def clean_url(URL_in):
    """Normalize a referrer URL before it is stored.

    Steps, in order:
      1. A known search-engine host is replaced by that engine's canonical
         URL (see _SEARCH_ENGINE_URLS).  Only the host is examined, so an
         engine name in the path or query of another site is ignored.
      2. http://name.com/ becomes http://www.name.com/.
      3. CGI variables (everything from '?') are removed.
      4. A trailing /index.* file name is removed.
    """
    start, end = _host_bounds(URL_in)
    canonical = _canonical_search_engine(URL_in[start:end], URL_in[end:])
    if canonical is not None:
        URL_in = canonical
        start, end = _host_bounds(URL_in)

    # Change http://name.com/ to http://www.name.com/
    if len(URL_in[start:end].split('.')) == 2:
        URL_in = URL_in[:start] + 'www.' + URL_in[start:]

    # Remove CGI variables from referrer URL
    query = URL_in.find('?')
    if query >= 0:
        URL_in = URL_in[:query]

    # Remove index.? from the end of the referrer URL.  The slash must lie
    # after the host so that a host such as index.example.com is not
    # mistaken for a file name.
    # NOTE(review): the slash is removed together with index.*, so
    # .../dir/index.html is stored as .../dir while .../dir/ keeps its
    # slash; left unchanged to stay compatible with already stored URLs.
    start, end = _host_bounds(URL_in)
    last_slash = URL_in.rfind('/')
    if last_slash >= end and URL_in.startswith('/index.', last_slash):
        URL_in = URL_in[:last_slash]
    return URL_in


def extract_data(in_string, start, end, pos):
    """Extract keyword data from a string.

    Searches in_string from index pos for the marker start and returns the
    text between it and the next occurrence of end, or up to the end of the
    string when end does not follow.  Returns None when start is not found.
    """
    begin = in_string.find(start, pos)
    if begin < 0:
        return None
    begin += len(start)
    finish = in_string.find(end, begin)
    if finish < 0:
        return in_string[begin:]
    return in_string[begin:finish]


def find_filename(URL_out):
    """Return the domain (host and optional port) of a URL without 'www.'.

    Any scheme is accepted (http://, https://, or none at all), and 'www.'
    is only stripped when it is the start of the host, never when it merely
    appears later in the path or query.
    """
    start, end = _host_bounds(URL_out)
    host = URL_out[start:end]
    if host[:4].lower() == 'www.':
        host = host[4:]
    return host


# NOTE(review): the definition of get_page_title() starts at this point, but it
# is cut off in the reviewed section of the file, so it is not rewritten here.
# The visible fragment is preserved verbatim on the next line.
# def get_page_title (URL_in): """ This function gets a page title from the HTML returned by the URL. If no page title exists, the URL is truncated and returned as the title.""" # print URL_in title_start_tag = '