#
# By Gilbert Le Blanc
# http://www.python-cgi-programming.com/
# webmaster@python-cgi-programming.com
#
# Released under GNU Public License
#
# Copyright (C)
# This program is free software; you can redistribute it and/or modify it under the terms of the
# GNU General Public License as published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.
# This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
# even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
# http://www.opensource.org/licenses/gpl-license.html
# This General Public License does not permit incorporating your program into proprietary programs.
#
# September 15, 2007 - Version 1.03
#   Add commas to the large reference counts.
#   Added more new search engine URLs.
#
# November 20, 2005 - Version 1.02
#   Escape (\) any single or double quotes in the Referrer Title so that the document.write
#   JavaScript functions correctly.
#   Added new search engine URLs.
#
# October 10, 2005 - Version 1.01
#   Check to make sure the document.location is in the form http://www.name.com/ or http://name.com/
#
# October 8, 2005 - Version 1.00
#

import os
import re
import string
import sys
import time
import urllib

try:
    # Python 3 moved urlopen into urllib.request.
    from urllib.request import urlopen as _urlopen
except ImportError:
    # Python 2: the function lives directly on the urllib module.
    _urlopen = urllib.urlopen

from SQLFunctions import *


# URL schemes accepted for both the referrer and the document location.
_SCHEMES = ('http://', 'https://')

# Search engines that answer under many country domains (google.de,
# search.yahoo.co.jp, ...).  They are recognised by a run of host labels
# rather than by a fixed domain.  More specific entries come first.
_ENGINE_HOST_LABELS = (
    ('images.google', 'http://images.google.com/'),
    ('google', 'http://www.google.com/'),
    ('images.search.yahoo', 'http://images.search.yahoo.com/'),
    ('search.yahoo', 'http://search.yahoo.com/'),
)

# Search engines recognised by registered domain: (domain, canonical URL).
# A host matches when it equals the domain or is a sub-domain of it; the first
# match wins, so nested domains (pictures.ask.com, mysearch.myway.com) are
# listed before their parents.
_ENGINE_DOMAINS = (
    ('altavista.com', 'http://www.altavista.com/'),
    ('pictures.ask.com', 'http://pictures.ask.com/'),
    ('ask.com', 'http://www.ask.com/'),
    ('aol.com', 'http://search.aol.com/'),
    ('search.msn.com', 'http://search.msn.com/'),
    ('answers.com', 'http://www.answers.com/'),
    ('excite.com', 'http://www.excite.com/'),
    ('dogpile.com', 'http://www.dogpile.com/'),
    ('use.com', 'http://www.use.com/'),
    ('a9.com', 'http://www.a9.com/'),
    ('alltheweb.com', 'http://www.alltheweb.com/'),
    ('att.net', 'http://www.att.net/'),
    ('avantfind.com', 'http://www.avantfind.com/'),
    ('chacha.com', 'http://search.chacha.com/'),
    ('crawler.com', 'http://www.crawler.com/'),
    ('comcast.net', 'http://www.comcast.net/'),
    ('earthlink.net', 'http://search.earthlink.net/'),
    ('foxnews.com', 'http://search.foxnews.com/'),
    ('infospace.com', 'http://www.infospace.com/home/search/'),
    ('live.com', 'http://www.live.com/'),
    ('mamma.com', 'http://www.mamma.com/'),
    ('metacrawler.com', 'http://www.metacrawler.com/'),
    ('mysearch.com', 'http://www.mysearch.com/'),
    ('mysearch.myway.com', 'http://www.mysearch.com/'),
    ('myway.com', 'http://search.myway.com/'),
    ('mywebsearch.com', 'http://www.mywebsearch.com/'),
    # The original rule was spelled 'oveture.com' and could never match.
    ('overture.com', 'http://www.content.overture.com/'),
    ('oxysearch.com', 'http://search.oxysearch.com/'),
    ('redzip.com', 'http://www.redzip.com/'),
    ('rr.com', 'http://www.rr.com/'),
    ('search.com', 'http://www.search.com/'),
    ('searchalot.com', 'http://www.searchalot.com/'),
    ('searchxyz.com', 'http://www.searchxyz.com/'),
    ('stumbleupon.com', 'http://www.stumbleupon.com/'),
    ('web.info.com', 'http://www.infospace.com/home/search/'),
    ('webcrawler.com', 'http://www.webcrawler.com/'),
    ('websearch.com', 'http://www.websearch.com/'),
    ('windstream.net', 'http://www.windstream.net/'),
)

# Matches the <title> element in any letter case, with optional attributes.
_TITLE_PATTERN = re.compile(r'<title(?:\s[^>]*)?>(.*?)</title>',
                            re.IGNORECASE | re.DOTALL)


def _host_span(url):
    """Return (start, end) indexes of the host[:port] part of url.

    The host starts after a leading 'scheme://' (or at index 0 when there is
    none) and ends at the first '/', '?' or '#', or at the end of the string.
    """
    start = 0
    marker = url.find('://')
    if marker > 0 and url[:marker].isalpha():
        start = marker + 3
    end = len(url)
    for delimiter in '/?#':
        hit = url.find(delimiter, start)
        if hit != -1 and hit < end:
            end = hit
    return start, end


def _canonical_engine_url(url):
    """Return the canonical URL of a known search engine, or None."""
    start, end = _host_span(url)
    # Compare on the bare lower-case host: no user info, no port.
    host = url[start:end].lower().split('@')[-1].split(':')[0]
    path = url[end:]

    dotted = '.' + host + '.'
    for labels, canonical in _ENGINE_HOST_LABELS:
        if '.' + labels + '.' in dotted:
            return canonical

    for domain, canonical in _ENGINE_DOMAINS:
        if host == domain or host.endswith('.' + domain):
            # AltaVista image search is told apart by its path.
            if domain == 'altavista.com' and path.startswith('/image'):
                return 'http://www.altavista.com/image/'
            return canonical
    return None


def _js_escape(text):
    """Escape text for a double-quoted JavaScript string literal."""
    # Backslashes first, otherwise the escapes added below get doubled.
    text = text.replace('\\', '\\\\')
    text = text.replace('"', '\\"')
    text = text.replace("'", "\\'")
    # A raw line break would terminate the JavaScript string literal.
    return text.replace('\r', ' ').replace('\n', ' ')


def AddCommas(number):
    """Return number as a string with a comma between each group of three
    digits, e.g. 1234567 -> '1,234,567'.

    Works for any number of digits and for negative integers.  A value whose
    text is not a plain integer is returned as str(number) unchanged.
    """
    text = str(number)
    sign = ''
    if text[:1] == '-':
        sign, text = '-', text[1:]
    if not text.isdigit():
        return sign + text
    # Length of the leading group: 1-3 digits; every later group has 3.
    head = len(text) % 3 or 3
    groups = [text[:head]]
    groups.extend([text[i:i + 3] for i in range(head, len(text), 3)])
    return sign + ','.join(groups)


def check_digits(URL_in):
    """Return "T" if the host part of URL_in consists only of digits, '.' and
    ':' (an IP address, optionally with a port), otherwise "F".

    An empty host also yields "T", so such URLs are rejected by update_data.
    """
    start, end = _host_span(URL_in)
    for char in URL_in[start:end]:
        if char not in '.:' and not char.isdigit():
            return "F"
    return "T"


def check_local(URL_in, URL_out):
    """Return "local" when both URLs share the same domain (as computed by
    find_filename), otherwise "referrer"."""
    if find_filename(URL_in) == find_filename(URL_out):
        return "local"
    return "referrer"


def clean_url(URL_in):
    """Normalise a referrer URL before it is stored.

    * URLs of known search engines collapse to one canonical URL per engine.
      Matching is done on the host name only, so an unrelated site whose URL
      merely contains an engine's name is left alone.
    * http://name.com/... becomes http://www.name.com/...
    * The query string ('?' and everything after it) is removed.
    * A trailing '/index.*' file name is removed.
    """
    canonical = _canonical_engine_url(URL_in)
    if canonical is not None:
        return canonical

    # Change http://name.com/ to http://www.name.com/
    start, end = _host_span(URL_in)
    if len(URL_in[start:end].split('.')) == 2:
        URL_in = URL_in[:start] + 'www.' + URL_in[start:]

    # Remove CGI variables from referrer URL
    URL_in = URL_in.split('?', 1)[0]

    # Remove index.? from the end of the referrer URL.  Only a slash that
    # belongs to the path is considered, so a host name starting with
    # 'index.' is never cut.
    # NOTE(review): the slash in front of 'index.' is dropped as well, as in
    # the original code, so '.../dir/index.html' is stored as '.../dir' while
    # '.../dir/' keeps its slash.  Kept so existing stored rows still match.
    last_slash = URL_in.rfind('/')
    path_start = _host_span(URL_in)[1]
    if last_slash >= path_start and URL_in.startswith('/index.', last_slash):
        URL_in = URL_in[:last_slash]
    return URL_in


def extract_data(in_string, start, end, pos):
    """Return the text found between the markers start and end.

    The search for start begins at index pos.  Returns None when start is not
    found, and everything after start when end is not found.
    """
    spos = in_string.find(start, pos)
    if spos < 0:
        return None
    begin = spos + len(start)
    epos = in_string.find(end, begin)
    if epos < 0:
        return in_string[begin:]
    return in_string[begin:epos]


def find_filename(URL_out):
    """Return the domain of URL_out: the host part (including any port) with
    a leading 'www.' removed.

    Both http:// and https:// URLs are handled, and a 'www.' that appears
    later in the URL (for example in the path) is ignored.
    """
    start, end = _host_span(URL_out)
    host = URL_out[start:end]
    if host.startswith('www.'):
        host = host[len('www.'):]
    return host


def get_page_title(URL_in):
    """Fetch URL_in and return a tuple (flag, title).

    flag is "T" and title is the text of the page's <title> element when one
    is found.  When the page cannot be fetched, or it has no usable title,
    flag is "F" and title is the domain of URL_in.
    """
    # NOTE(review): in the source the start/end tag literals were empty
    # strings, which made every call return the domain fallback; they are
    # taken to have been '<title>' / '</title>' -- confirm.
    fallback = ("F", find_filename(URL_in))
    try:
        response = _urlopen(URL_in)
        try:
            page = response.read()
        finally:
            # Release the connection even when the read fails.
            response.close()
    except IOError:
        return fallback

    if not isinstance(page, str):
        # Python 3 returns bytes.  The page charset is not known here, so
        # decode leniently instead of failing on a stray byte.
        page = page.decode('utf-8', 'replace')

    match = _TITLE_PATTERN.search(page)
    if match is None:
        return fallback
    # Remove surrounding whitespace and collapse tabs/newlines inside the
    # title into single spaces.
    title = ' '.join(match.group(1).split())
    if not title:
        return fallback
    return ("T", title)


def show_data(URL_out, minimum, expires, limit):
    """Return JavaScript source that writes the referrer list for URL_out's
    domain into the page with document.write.

    Each row returned by SelectReferrer is used as:
        Results 0 - Referrer Title
        Results 1 - Referrer URL
        Results 2 - Referrer Count
    """
    # NOTE(review): the HTML markup inside the document.write strings was
    # lost from the source this file was recovered from.  The <p>, <a> and
    # <br> tags below are a minimal reconstruction -- confirm.
    domain = find_filename(URL_out)
    results = SelectReferrer(domain, minimum, expires, limit)

    lines = ['function referrals() {']
    lines.append(' document.write("<p>Referring Web Pages, last '
                 + str(expires) + ' days:</p>");')
    if not results:
        lines.append(' document.write("<p>No referrals'
                     ' for this web site yet.</p>");')
    for field in results:
        # Titles come from third-party pages: neutralise angle brackets so a
        # title cannot inject markup or close the enclosing script element.
        title = field[0].replace('<', '&lt;').replace('>', '&gt;')
        title = _js_escape(title)
        # Inside the href attribute a double quote would end the attribute,
        # so it is percent-encoded rather than backslash-escaped.
        url = field[1].replace('"', '%22')
        url = url.replace('<', '%3C').replace('>', '%3E')
        url = _js_escape(url)
        lines.append(' document.write("<a href=\\"' + url + '\\">' + title
                     + '</a> [' + AddCommas(field[2]) + ']<br>");')
    lines.append(' document.write("<p>List referrers to your own'
                 ' site (free!)</p>");')
    lines.append('}')
    lines.append('referrals();')
    return os.linesep.join(lines) + os.linesep


def update_data(URL_in, URL_out):
    """Record URL_in as a referrer of URL_out through UpdateReferrer.

    Nothing is recorded when either URL is missing or blank, does not start
    with http:// or https://, has an IP address as host, or when both URLs
    belong to the same domain.  Always returns None.
    """
    for url in (URL_in, URL_out):
        if not url or not url.strip():
            return
        if not url.startswith(_SCHEMES):
            return
        if check_digits(url) == "T":
            return
    if check_local(URL_in, URL_out) == "local":
        return

    f_in = clean_url(URL_in)
    f_out = find_filename(URL_out)
    # get_page_title returns a (flag, title) tuple; it is passed on unchanged.
    title = get_page_title(f_in)
    UpdateReferrer(f_out, f_in, title)
    return