# {{{
#-*-coding:utf8-*-
# NOTE: the "{{{" / "}}}" markers are wiki code-fence residue, kept as
# comments so that the file parses as Python.
"""Two-class (economy vs. politics) word-frequency text classifier.

Training counts how often each word occurs in the documents of each class
(one document per line).  Classification sums, over the words of a document,
the log ratio of the add-one-smoothed word counts of the two classes; a
positive score means "economy", a negative score means "politics".
"""
import io
import math
import re
import sys

classlist = ["economy", "politics"]

# Default locations of the corpora (one sub-directory per class).
TRAINDIR = "/home/newmoni/workspace/svm/package/train/"
TESTDIR = "/home/newmoni/workspace/svm/package/test/"


def maketestdir(i):
    """Return the path of the test file for class *i*."""
    return TESTDIR + i + "/" + i + ".txt"


def _maketraindir(i, traindir=TRAINDIR):
    """Return the path of the training index file for class *i*."""
    return traindir + i + "/index." + i + ".db"


def _readdocs(path):
    """Return the non-blank lines (documents) of the UTF-8 file at *path*."""
    with io.open(path, encoding="utf-8") as handle:
        # A trailing newline would otherwise yield an empty "document".
        return [line for line in handle.read().split("\n") if line.strip()]


def _tokenize(text):
    """Split *text* on spaces/newlines, dropping empty tokens."""
    return [word for word in text.replace("\n", " ").split(" ") if word]


def readtrain(traindir=TRAINDIR):
    """Build the model from the training files of every class in classlist.

    Args:
        traindir: directory holding ``<class>/index.<class>.db`` files,
            one document per line.  Defaults to the original fixed path.

    Returns:
        A tuple ``(classfreqdic, wordfreqdic, prob1, classprob1, classprob2)``:
        documents per class, per-class word counts, the log ratio of the
        economy/politics priors, and the two priors themselves.

    Raises:
        ValueError: if a class has no training documents (the priors and
            their log ratio would be undefined).
    """
    classfreqdic = {}
    wordfreqdic = {}
    totalct = 0
    for eachclass in classlist:
        doclist = _readdocs(_maketraindir(eachclass, traindir))
        classfreqdic[eachclass] = len(doclist)
        totalct += len(doclist)
        counts = wordfreqdic[eachclass] = {}
        for line in doclist:
            for word in _tokenize(line):
                counts[word] = counts.get(word, 0) + 1

    for eachclass in ("economy", "politics"):
        if not classfreqdic.get(eachclass):
            raise ValueError("no training documents for class %r" % eachclass)

    totalct = float(totalct)
    classprob1 = classfreqdic["economy"] / totalct
    classprob2 = classfreqdic["politics"] / totalct
    prob1 = math.log(classprob1 / classprob2)
    return classfreqdic, wordfreqdic, prob1, classprob1, classprob2


def classifydocument(document, wordfreq=None, prior1=None, prior2=None):
    """Score *document*: > 0 favours "economy", < 0 favours "politics".

    Args:
        document: text to classify; words are separated by spaces/newlines.
        wordfreq: per-class word counts as returned by readtrain().
        prior1: prior of the "economy" class.
        prior2: prior of the "politics" class.

    Any model argument left as None falls back to the module-level
    ``wordfreqdic`` / ``classprob1`` / ``classprob2`` names, which is how
    the original one-argument call worked (NameError if they are unset).

    Returns:
        The summed log ratio over all words; 0 for a document without words.
    """
    if wordfreq is None:
        wordfreq = wordfreqdic
    if prior1 is None:
        prior1 = classprob1
    if prior2 is None:
        prior2 = classprob2

    economy = wordfreq["economy"]
    politics = wordfreq["politics"]
    totalprob = 0
    for word in _tokenize(document):
        # Add-one smoothing so unseen words never give log(0).
        classfreq1 = economy.get(word, 0) + 1
        classfreq2 = politics.get(word, 0) + 1
        # NOTE(review): counts are scaled by the class priors and the log
        # prior ratio (prob1 from readtrain) is never added to the score;
        # kept as in the original -- confirm this is the intended model.
        totalprob += math.log(
            (float(classfreq1) / prior1) / (float(classfreq2) / prior2)
        )
    return totalprob


def main():
    """Train, classify every test document, and print the accuracy."""
    _, wordfreqdic, _, classprob1, classprob2 = readtrain()
    correctct = 0
    totalct = 0
    for eachclass in classlist:
        for line in _readdocs(maketestdir(eachclass)):
            totalprob = classifydocument(
                line, wordfreqdic, classprob1, classprob2
            )
            print("%s %s" % (eachclass, totalprob))
            if eachclass == "economy":
                if totalprob > 0:
                    correctct += 1
            elif eachclass == "politics":
                if totalprob < 0:
                    correctct += 1
            totalct += 1
    # Guard against an empty test set instead of dividing by zero.
    accuracy = correctct / float(totalct) if totalct else 0.0
    print("%s %s %s" % (correctct, totalct, accuracy))


if __name__ == "__main__":
    main()
# }}}