Bidplaza Statistics - bpreloaddatabase.py

bpreloaddatabase.py

Caricato da: Piero Tofy
Scarica il programma completo


	#!/usr/bin/python
# Author: Piero Toffanin
# BidPlaza Statistics 1.0
# Reload offers database module
"""Download closed-auction data from bidplaza.it into the local ``odb/`` store.

The module lists the ids of closed auctions (spelled "audition" in the
identifiers below), downloads each auction page, extracts its offers
("offerts") with the regular expression stored in ``offertsregex.dat`` and
writes one ``odb/<id>.dat`` file per auction.  After a run it asks
``bprebuildindex`` to rebuild its index.

The code runs unchanged on Python 2 and Python 3.
"""

import os
import re
import string  # no longer used here; kept so the module's import set is unchanged
import urllib

try:  # Python 2
    from urllib import urlopen
except ImportError:  # Python 3
    from urllib.request import urlopen

import bprebuildindex

bidplazaAuditionUrl = "http://www.bidplaza.it/t_auction_object_ended_view.php?bidauction_id="
bidplazaClosedAuditionsUrl = "http://www.bidplaza.it/t_auctions_ended.php?from="
# Step added to the "from=" offset for every further listing page.
ClosedAuditionsMultiplier = 40


def _asText(raw):
    """Return raw page content as the interpreter's native ``str``.

    Python 2 already yields ``str``.  Python 3 yields ``bytes``, which are
    decoded as latin-1: that codec maps every byte to exactly one character,
    so `_asBytes` can restore the original bytes whatever the page's real
    encoding is.
    """
    if isinstance(raw, str):
        return raw
    return raw.decode("latin-1")


def _asBytes(text):
    """Inverse of `_asText`: return the byte string to store on disk."""
    if isinstance(text, bytes):
        return text
    return text.encode("latin-1")


def _downloadPage(url):
    """Fetch ``url`` and return its body as text, always closing the response."""
    response = urlopen(url)
    try:
        return _asText(response.read())
    finally:
        response.close()


def _auditionPath(AuditionId):
    """Return the path of the data file that stores auction ``AuditionId``."""
    return "odb/" + str(AuditionId) + ".dat"


def _buildAuditionRecord(audProdName, matches):
    """Return the text stored in an auction's ``.dat`` file.

    ``matches`` is the non-empty list of regex matches for the auction page.
    Presumably ``match[0]`` is an offer value and ``match[1]`` the number of
    bets placed on it -- the regex lives in ``offertsregex.dat``, so confirm
    against that file.

    Layout, one item per line: product name, total offers (real + heuristic),
    winning offer, heuristic offer count, average bets per offer, then one
    ``"<offer> <bets>"`` line per offer.
    """
    # The winner is the first offer that received exactly one bet; "0" if none.
    winningoffert = next((item[0] for item in matches if item[1] == "1"), "0")

    # Estimate of offers missing below the first listed one:
    #   first / ((last - first) / number of matches) - 1
    # NOTE(review): this treats matches[-1] as the maximum offer; the write
    # loop below suggests the page may hold a second column that restarts, in
    # which case the last match is not the maximum -- confirm.
    lowest = float(matches[0][0])
    spread = float(matches[-1][0]) - lowest
    if matches[0][0] == "1" or spread == 0:
        # Nothing can be missing below an offer of 1, and a zero spread would
        # otherwise divide by zero.
        heuristic = 0
    else:
        heuristic = int(lowest / (spread / len(matches)) - 1)

    totalOfferts = len(matches) + heuristic
    totalBets = sum(int(item[1]) for item in matches)
    average = round(float(totalBets) / float(len(matches)), 4)

    lines = [audProdName, str(totalOfferts), winningoffert, str(heuristic), str(average)]
    last = 0
    for item in matches:
        value = int(item[0])
        if value <= last:
            # Offers stopped increasing: second column reached, skip the rest.
            break
        lines.append(item[0] + " " + item[1])
        last = value
    return "\n".join(lines) + "\n"


def _processAuditions(skipStored):
    """Fetch every listed auction once, then rebuild the index.

    When ``skipStored`` is true, auctions whose data file already exists are
    left untouched.
    """
    fetched = set()
    for auid in getAuditionIDs():
        if auid in fetched:
            continue
        if skipStored and os.path.exists(_auditionPath(auid)):
            continue
        fetchAudition(auid)
        fetched.add(auid)
    bprebuildindex.rebuildIndex()


def startDatabaseReloadProcess():
    """Download every closed auction again, overwriting stored data files."""
    print("Database reload process started...")
    _processAuditions(False)
    print("Database reload process completed!")


def startDatabaseUpdateProcess():
    """Download only the closed auctions that have no data file yet."""
    print("Database update process started...")
    _processAuditions(True)
    print("Database update process completed!")


def fetchAudition(AuditionId):
    """Download one auction page and store its offers in ``odb/<id>.dat``.

    The record is built completely before the output file is opened, so a
    page that cannot be parsed never leaves a truncated file behind (an update
    run would otherwise take that file as "already fetched").  When the page
    has no ``<title>`` or no offer matches, a message is printed and nothing is
    written.

    AuditionId -- auction id, appended to ``bidplazaAuditionUrl``.
    Returns None.
    """
    auditionUrl = bidplazaAuditionUrl + str(AuditionId)
    print("Fetching data for AuditionId " + str(AuditionId) + "(" + auditionUrl + ")")

    # Fetch data
    data = _downloadPage(auditionUrl)

    # Find auction product name candidates
    titles = re.findall("<title>([^<]+)</title>", data)

    # Read regex string from file and find matches
    with open("offertsregex.dat") as regexFile:
        bidplazaOffertsRegex = regexFile.read()
    matches = re.findall(bidplazaOffertsRegex, data)

    if not titles or not matches:
        print("No offert data found for AuditionId " + str(AuditionId) + ", skipping")
        return

    audProdName = titles[0].replace("Bidplaza - ", "")
    record = _buildAuditionRecord(audProdName, matches)

    # Write the record to file
    print("Writing results for AuditionId " + str(AuditionId))
    fd = os.open(_auditionPath(AuditionId), os.O_WRONLY | os.O_TRUNC | os.O_CREAT)
    with os.fdopen(fd, "wb") as output:
        output.write(_asBytes(record))


def getAuditionIDs():
    """Return the auction ids found on the closed-auction listing pages.

    Pages are requested with offsets 0, 40, 80, ... (``ClosedAuditionsMultiplier``
    per step).  Ids are returned as strings in page order and may repeat.
    Paging stops when a page has no ``t_auctions_ended.php?from=`` link, or
    when it contributes no id that was not already seen -- the latter keeps
    the loop finite if every page carries a pager link.
    """
    ret = []
    seen = set()
    c = 0
    while True:
        pageUrl = bidplazaClosedAuditionsUrl + str(ClosedAuditionsMultiplier * c)
        print("Fetching IDs from page " + str(c + 1) + " (" + pageUrl + ")")
        data = _downloadPage(pageUrl)

        pageIds = re.findall(r"t_auction_object_ended_view\.php\?bidauction_id=([0-9]+)", data)
        ret.extend(pageIds)
        hasNewIds = not seen.issuperset(pageIds)
        seen.update(pageIds)

        hasPagerLink = re.search(r"t_auctions_ended\.php\?from=([0-9]+)", data) is not None
        if not hasPagerLink or not hasNewIds:
            break
        c += 1
    return ret