#-*-coding:utf8-*-
import re, sys, math
# Class labels handled by the classifier; the order is economy first, politics second.
classlist = ["economy", "politics"]


def maketestdir(i):
    """Return the path of the test document file for class name *i*.

    The result has the form
    ``/home/newmoni/workspace/svm/package/test/<i>/<i>.txt``.
    """
    testroot = "/home/newmoni/workspace/svm/package/test/"
    return testroot + i + "/" + i + ".txt"
def readtrain():
    """Load the per-class training indexes and build the frequency tables.

    For every class name in the module-level ``classlist`` the file
    ``/home/newmoni/workspace/svm/package/train/<class>/index.<class>.db``
    is read.  Each non-empty line counts as one document and is split on
    single spaces into words.

    Returns:
        A 5-tuple ``(classfreqdic, wordfreqdic, prob1, classprob1,
        classprob2)`` where

        * ``classfreqdic`` maps class name -> number of documents,
        * ``wordfreqdic`` maps class name -> ``{word: occurrence count}``,
        * ``prob1`` is ``log(classprob1 / classprob2)``,
        * ``classprob1`` is the fraction of all documents in "economy",
        * ``classprob2`` is the fraction of all documents in "politics".

    Raises:
        ValueError: if a class index contains no documents.
        IOError / OSError: if an index file cannot be opened.
        KeyError: if "economy" or "politics" is missing from ``classlist``.
    """
    trainroot = "/home/newmoni/workspace/svm/package/train/"
    classfreqdic = {}
    wordfreqdic = {}
    totalct = 0
    for eachclass in classlist:
        indexpath = trainroot + eachclass + "/index." + eachclass + ".db"
        # Use a context manager so the handle is closed even if reading fails.
        with open(indexpath) as indexfile:
            rawlines = indexfile.read().split("\n")
        # A newline-terminated file yields a trailing "" after split("\n");
        # counting it would add a phantom document (and a phantom "" word)
        # to the class, skewing the priors.  Empty lines are not documents.
        doclist = [line for line in rawlines if line]
        if not doclist:
            # Fail here with a clear message rather than later with a
            # ZeroDivisionError / math domain error in the prior computation.
            raise ValueError("no training documents found in " + indexpath)
        classfreqdic[eachclass] = len(doclist)
        totalct += len(doclist)
        wordfreq = {}
        for line in doclist:
            for word in line.split(" "):
                # dict.get replaces the Python-2-only has_key() test.
                wordfreq[word] = wordfreq.get(word, 0) + 1
        wordfreqdic[eachclass] = wordfreq
    # Force float division for the priors on Python 2 as well.
    totalct = float(totalct)
    classprob1 = classfreqdic["economy"] / totalct
    classprob2 = classfreqdic["politics"] / totalct
    prob1 = math.log(classprob1 / classprob2)
    return classfreqdic, wordfreqdic, prob1, classprob1, classprob2
def classifydocument(document):
    """Return the economy-vs-politics log-ratio score of *document*.

    Newlines in *document* are replaced by spaces and the text is split on
    single spaces.  For every resulting token the training counts from the
    module-level ``wordfreqdic`` are looked up for both classes, one is
    added to each count (so a token absent from a class contributes a count
    of 1), each count is divided by its class fraction (``classprob1`` for
    "economy", ``classprob2`` for "politics"), and the logarithm of the
    economy/politics quotient is added to the running total.

    The caller in this file treats a positive total as "economy" and a
    negative total as "politics".
    """
    tokens = document.replace("\n", " ").split(" ")
    economycounts = wordfreqdic["economy"]
    politicscounts = wordfreqdic["politics"]
    score = 0
    for token in tokens:
        economycount = economycounts.get(token, 0) + 1
        politicscount = politicscounts.get(token, 0) + 1
        ratio = (economycount / classprob1) / (politicscount / classprob2)
        score += math.log(ratio)
    return score
if __name__=="__main__":
    # Train, then score every test document of every class and report accuracy.
    # classifydocument() reads wordfreqdic, classprob1 and classprob2 as
    # module-level globals, so these exact names must be bound here.
    classfreqdic, wordfreqdic, prob1, classprob1, classprob2 = readtrain()
    correctct = 0
    totalct = 0
    for eachclass in classlist:
        # Context manager closes the test file once it has been read.
        with open(maketestdir(eachclass)) as testfile:
            doclist = testfile.read().split("\n")
        for line in doclist:
            # split("\n") leaves a trailing "" for newline-terminated files;
            # an empty line is not a document and must not be scored.
            if not line:
                continue
            totalprob = classifydocument(line)
            # Single-argument print(...) emits the same text on Python 2
            # and Python 3.
            print("%s %s" % (eachclass, totalprob))
            # Positive score -> economy, negative score -> politics.
            # A score of exactly 0 is counted as wrong for either class.
            if eachclass == "economy":
                if totalprob > 0:
                    correctct += 1
            elif eachclass == "politics":
                if totalprob < 0:
                    correctct += 1
            totalct += 1
    # Guard the summary against the case where no test documents were found.
    accuracy = correctct / float(totalct) if totalct else 0.0
    print("%s %s %s" % (correctct, totalct, accuracy))