# No difference found
#-*-coding:utf8-*-
import re, sys, math
# Class labels known to the classifier. Each label is also used as the
# directory name and file stem when building data-file paths.
classlist = ["economy", "politics"]


def maketestdir(i):
    """Return the path of the test-document file for class name *i*."""
    relative = "/".join((i, i + ".txt"))
    return "/home/newmoni/workspace/svm/package/test/" + relative
def readtrain(trainroot="/home/newmoni/workspace/svm/package/train/", classes=None):
    """Load per-class document and word counts from the training index files.

    For every class name the file ``<trainroot>/<name>/index.<name>.db`` is
    read. Each newline-separated line is treated as one document and each
    space-separated token as one word.

    Args:
        trainroot: Directory holding one sub-directory per class. Defaults to
            the original hard-coded training location. A missing trailing
            "/" is added automatically.
        classes: Iterable of class names to load. Defaults to the
            module-level ``classlist``. It must contain "economy" and
            "politics", because the returned priors are always computed for
            exactly those two classes.

    Returns:
        A 5-tuple ``(classfreqdic, wordfreqdic, prob1, classprob1,
        classprob2)`` where

        * ``classfreqdic`` maps class name -> number of lines in its file,
        * ``wordfreqdic`` maps class name -> {word: occurrence count},
        * ``prob1`` is ``log(classprob1 / classprob2)``,
        * ``classprob1`` is the fraction of all lines belonging to
          "economy",
        * ``classprob2`` is the fraction of all lines belonging to
          "politics".

    Raises:
        IOError: (``OSError`` on Python 3) if an index file cannot be opened.
        KeyError: if "economy" or "politics" is missing from ``classes``.
    """
    if classes is None:
        classes = classlist
    if trainroot and not trainroot.endswith("/"):
        trainroot += "/"

    classfreqdic = {}
    wordfreqdic = {}
    totalct = 0
    for eachclass in classes:
        path = trainroot + eachclass + "/index." + eachclass + ".db"
        # Close the handle deterministically instead of leaking it.
        with open(path) as handle:
            # NOTE(review): split("\n") keeps a final empty entry when the
            # file ends with a newline, so that empty line is counted as a
            # document and "" as a word. Left unchanged so existing counts
            # and vocabularies stay the same -- confirm whether intended.
            doclist = handle.read().split("\n")

        classfreqdic[eachclass] = len(doclist)
        totalct += len(doclist)

        counts = {}
        for line in doclist:
            for word in line.split(" "):
                counts[word] = counts.get(word, 0) + 1
        wordfreqdic[eachclass] = counts

    # Force true division on Python 2, where int / int would truncate.
    totalct = float(totalct)
    classprob1 = classfreqdic["economy"] / totalct
    classprob2 = classfreqdic["politics"] / totalct
    prob1 = math.log(classprob1 / classprob2)
    return classfreqdic, wordfreqdic, prob1, classprob1, classprob2
def classifydocument(document, wordfreq=None, prior1=None, prior2=None):
    """Score *document* as "economy" versus "politics".

    The document is split on spaces (newlines are first turned into spaces).
    For every token the add-one-smoothed count in each class is divided by
    that class's prior, and the log of the economy/politics ratio is summed.

    Args:
        document: Text to score.
        wordfreq: Mapping with keys "economy" and "politics", each mapping
            word -> count (the ``wordfreqdic`` returned by ``readtrain``).
            Defaults to the module-level ``wordfreqdic``.
        prior1: Prior of the "economy" class. Defaults to the module-level
            ``classprob1``.
        prior2: Prior of the "politics" class. Defaults to the module-level
            ``classprob2``.

    Returns:
        float: the summed log ratio. The evaluation loop in this file counts
        a value > 0 as "economy" and a value < 0 as "politics".

    Raises:
        NameError: if an argument is omitted and the matching module-level
            global has not been set (they are only created when the module
            runs as a script).
    """
    # Explicit arguments let importing modules use this function; the
    # globals are only a fallback for the original script-style call.
    if wordfreq is None:
        wordfreq = wordfreqdic
    if prior1 is None:
        prior1 = classprob1
    if prior2 is None:
        prior2 = classprob2

    # float() guards against truncating int division on Python 2.
    prior1 = float(prior1)
    prior2 = float(prior2)
    economy_counts = wordfreq["economy"]
    politics_counts = wordfreq["politics"]

    totalprob = 0.0
    for word in document.replace("\n", " ").split(" "):
        # +1 smoothing so unseen words never produce log(0) or a zero divisor.
        classfreq1 = economy_counts.get(word, 0) + 1
        classfreq2 = politics_counts.get(word, 0) + 1
        totalprob += math.log((classfreq1 / prior1) / (classfreq2 / prior2))
    return totalprob
if __name__ == "__main__":
    # Evaluate the classifier on the per-class test files and report accuracy.
    #
    # The model values are bound at module level on purpose:
    # classifydocument() reads wordfreqdic, classprob1 and classprob2 as
    # globals when called with only the document.
    classfreqdic, wordfreqdic, prob1, classprob1, classprob2 = readtrain()

    correctct = 0
    totalct = 0
    for eachclass in classlist:
        # Close each test file as soon as it has been read.
        with open(maketestdir(eachclass)) as testfile:
            doclist = testfile.read().split("\n")

        for line in doclist:
            totalprob = classifydocument(line)
            # Single-argument print(...) behaves the same as a Python 2
            # print statement and as the Python 3 print function.
            print("%s %s" % (eachclass, totalprob))
            # Positive score => "economy", negative => "politics"; a score of
            # exactly 0 is counted as wrong for either class.
            if eachclass == "economy" and totalprob > 0:
                correctct += 1
            elif eachclass == "politics" and totalprob < 0:
                correctct += 1
            totalct += 1

    # Output: number correct, number of documents scored, accuracy.
    print("%s %s %s" % (correctct, totalct, correctct / float(totalct)))
#-*-coding:utf8-*-
import re, sys, math
from NaiveBayesian import *
if __name__ == "__main__":
    # Convert the per-class test documents into SVM-light style lines:
    #   <label> <featureindex>:1 <featureindex>:1 ...
    # The label is the 1-based position of the class in classlist and the
    # feature indices come from the vocabulary seen in the training data.
    classfreqdic, wordfreqdic, prob1, classprob1, classprob2 = readtrain()
    print("read end ")

    # Vocabulary = union of the words seen in every training class.
    wordlist = set()
    for eachclass in classlist:
        wordlist.update(wordfreqdic[eachclass])
    print("end")
    # NOTE(review): feature numbering follows set iteration order, which is
    # not guaranteed to be stable across interpreters or runs. Left unsorted
    # so indices keep matching any file produced earlier the same way --
    # confirm before changing.
    wordlist = list(wordlist)

    # Feature indices are 1-based.
    wordindexdic = {}
    for idx, word in enumerate(wordlist):
        wordindexdic[word] = idx + 1

    svmttest = "../data/test2.svm_light"
    # Append mode is kept from the original: re-running the script adds to
    # the existing file rather than replacing it. The with-block guarantees
    # the output is flushed and closed.
    with open(svmttest, "a") as fout:
        for idx, eachclass in enumerate(classlist):
            with open(maketestdir(eachclass)) as testfile:
                doclist = testfile.read().split("\n")
            print(idx)

            for line in doclist:
                # Count the known words of this document by feature index;
                # words absent from the training vocabulary are skipped.
                docwordfreq = {}
                for word in line.split(" "):
                    if word not in wordindexdic:
                        continue
                    wordidx = wordindexdic[word]
                    docwordfreq[wordidx] = docwordfreq.get(wordidx, 0) + 1

                # NOTE(review): labels are emitted as 1, 2, ... -- confirm
                # this matches what the consuming SVM tool expects.
                outlist = [str(idx + 1)]
                # Features are written in ascending index order.
                for wordidx in sorted(docwordfreq):
                    # Binary presence feature; use docwordfreq[wordidx]
                    # instead of 1 to emit term frequencies.
                    outlist.append(str(wordidx) + ":" + str(1))
                fout.write(" ".join(outlist) + "\n")