Plex 로 Wiki Page Parser 를 만들던중. Plex 는 아주 훌륭한 readability 의 lexical analyzer code 를 만들도록 도와준다.
Plex Example : Wiki Parser ¶
현재 PyKi라는, 1002가 개인적으로 만들어서 사용중인 위키에서의 parser 클래스 중 일부 코드이다.
{{{~cpp
from Plex import *
import cStringIO, StringIO
class Parser:
    """Convenience front end: wraps wiki text in a stream and runs WikiParser.

    Holds the configuration (inter-wiki map, CGI script name, macro table)
    that is handed to a new WikiParser on every parse() call.
    """

    def __init__(self, anInterWikiMap=None, aScriptName='', aMacros=None):
        """Store the parser configuration.

        anInterWikiMap -- dict of inter-wiki name -> base URL; a fresh empty
                          dict is used when omitted.
        aScriptName    -- prefix used when building page links.
        aMacros        -- dict of macro name -> object with execute(); a
                          fresh empty dict is used when omitted.
        """
        # A literal `{}` default is created once at definition time and would
        # be shared (and mutated) by every Parser built without arguments, so
        # create a new dict per instance.  An explicitly passed dict -- even
        # an empty one -- is kept as-is.
        if anInterWikiMap is None:
            anInterWikiMap = {}
        if aMacros is None:
            aMacros = {}
        self.interWikiMap = anInterWikiMap
        self.scriptName = aScriptName
        self.macros = aMacros

    def makeToStream(self, aText):
        """Return a file-like object over aText for the scanner to read."""
        return cStringIO.StringIO(aText)

    def parse(self, aText):
        """Return the HTML produced by scanning the wiki markup in aText."""
        stream = self.makeToStream(aText)
        scanner = WikiParser(stream, self.interWikiMap,
                             self.scriptName, self.macros)
        return scanner.linkedLine()
class WikiParser(Scanner):
    """Plex scanner that rewrites wiki markup into HTML, one token at a time.

    Each lexicon entry pairs a pattern with an action.  An action is either a
    method taking the matched text and returning the HTML fragment for it, or
    a constant string used as the fragment.  linkedLine() concatenates the
    fragments for the whole input stream.
    """

    # ------------------------------------------------------------------
    # Actions for inline formatting
    # ------------------------------------------------------------------

    def repl_bold(self, aText):
        """Toggle the bold state; return the opening or closing <B> tag."""
        self.bold = not self.bold
        if self.bold:
            return "<B>"
        return "</B>"

    def repl_italic(self, aText):
        """Toggle the italic state; return the opening or closing <I> tag."""
        self.italic = not self.italic
        if self.italic:
            return "<I>"
        return "</I>"

    def repl_boldAndItalic(self, aText):
        """Open or close bold and italic together.

        Whether to open or close is decided by the italic flag, as before.
        The bold flag is now kept in step with it; previously it was left
        untouched, so a lone ''' following the combined marker emitted the
        wrong (opening vs. closing) <B> tag.
        """
        opening = not self.italic
        self.italic = opening
        self.bold = opening
        if opening:
            return "<I><B>"
        return "</B></I>"

    def repl_ruler(self, aText):
        """Return a horizontal rule."""
        return "<HR>\n"

    def repl_enter(self, aText):
        """Return a line break for a newline in the source text."""
        return "<BR>\n"

    # ------------------------------------------------------------------
    # Actions for links
    # ------------------------------------------------------------------

    def getLinkStr(self, anUrl):
        """Return an anchor whose target and label are both anUrl."""
        return "<a href='%(url)s'>%(url)s</a>" % {'url':anUrl}

    def getImageStr(self, anUrl):
        """Return an <img> tag pointing at anUrl."""
        return "<img src='%(url)s'>" % {'url':anUrl}

    def isImage(self, aFileUrl):
        """Return True when the text after the last '.' is a known image
        extension (compared case-sensitively)."""
        imgExtensions = ("jpg", "png", "gif", "jpeg")
        extension = aFileUrl[aFileUrl.rfind(".")+1:]
        return extension in imgExtensions

    def repl_url(self, aText):
        """Render a bare URL as an inline image or as a hyperlink."""
        if self.isImage(aText):
            return self.getImageStr(aText)
        return self.getLinkStr(aText)

    def repl_interwiki(self, aText):
        """Render 'WikiName:PageName' as an inter-wiki link.

        Known wiki names link to the mapped URL plus the page name; unknown
        names link to the local InterMap page and are flagged in the icon's
        ALT text.
        """
        # Split on the first ':' only: the page part of the pattern may itself
        # contain ':' and an unbounded split would then fail to unpack.
        wikiMapName, pageName = aText.split(":", 1)
        if wikiMapName in self.interWikiMap:
            url = self.interWikiMap[wikiMapName]
            imageTag = "<IMG SRC=../intertag.gif BORDER=0>"
            formatText = "<A HREF=%(url)s>%(imageTag)s</A><A HREF=%(url)s%(pageName)s>%(pageName)s</A>"
        else:
            url = self.scriptName+"/InterMap"
            imageTag = "<IMG SRC=../intertag.gif BORDER=1 ALT='Bad:%s'>"%(wikiMapName)
            formatText="<A HREF=%(url)s>%(imageTag)s</A><A HREF=%(url)s>%(pageName)s</A>"
        return formatText % {'url':url, 'pageName':pageName, 'imageTag':imageTag}

    def repl_pagelink(self, aText):
        """Render a CamelCase word as a link to the page of that name."""
        values = {'scriptName':self.scriptName, 'pageName':aText}
        # NOTE(review): only the literal name "NonExistPage" gets the
        # 'nonexist' class -- looks like a placeholder for a real page-exists
        # lookup; confirm.
        if aText == "NonExistPage":
            return "<a class=nonexist href='%(scriptName)s/%(pageName)s'>%(pageName)s</a>" % values
        return "<a href='%(scriptName)s/%(pageName)s'>%(pageName)s</a>" % values

    def repl_pagelinkUsingUnderbar(self, aText):
        """Render 'name_' as a link to page 'name' (trailing '_' dropped)."""
        pageName = aText[:-1]
        return "<a href='%(scriptName)s/%(pageName)s'>%(pageName)s</a>" % {'scriptName':self.scriptName, 'pageName':pageName}

    # ------------------------------------------------------------------
    # Other actions
    # ------------------------------------------------------------------

    def repl_macro(self, aText):
        """Run the macro named inside '[[...]]' and return its output.

        An unknown macro name (or a falsy table entry) yields ''.
        """
        class NoneMacro:
            # Stand-in used when no macro is registered under the name.
            def execute(self):
                return ''
        macroName = aText[2:-2]  # strip the surrounding '[[' and ']]'
        macro = self.macros.get(macroName) or NoneMacro()
        return macro.execute()

    def repl_normalString(self, aText):
        """Pass the matched text through unchanged."""
        return aText

    def repl_rawTextStart(self, aText):
        """Switch the scanner to the 'rawtext' state and open a <PRE> block."""
        self.begin("rawtext")
        return "<PRE>"

    def repl_rawTextEnd(self, aText):
        """Switch back to the default scanner state and close the <PRE> block."""
        self.begin("")
        return "</PRE>"

    # ------------------------------------------------------------------
    # Patterns: automatic links
    # ------------------------------------------------------------------
    urlHeader = Str("http://")
    stringUntilSpace = Rep(AnyBut(" "))
    url = urlHeader + stringUntilSpace
    upperCase = Range('AZ')
    lowerCase = Range('az')
    upperCaseSequence = Rep1(upperCase)
    lowerCaseSequence = Rep1(lowerCase)
    alphabetSequence = Rep1(upperCaseSequence | lowerCaseSequence)
    interwikiDelim = Str(":")
    stringUntilSpaceOrCommaOrRest = Rep1(AnyBut(" .,"))
    interwiki = alphabetSequence + interwikiDelim + stringUntilSpaceOrCommaOrRest
    pagelinkUsingCamelWord = upperCase + Rep(lowerCase) + Rep1(upperCaseSequence + lowerCaseSequence)
    underbarForLink = Str('_')
    pagelinkUsingUnderbar = Rep1(AnyBut('{\n_ ')) + underbarForLink
    macro = Str('[[') + Rep1(AnyBut(']]')) + Str(']]')
    space = Str(" ")

    # ------------------------------------------------------------------
    # Patterns: formatting tags
    # ------------------------------------------------------------------
    bold = Str("'''")
    italic = Str("''")
    boldanditalic = bold + italic
    htmllefttag = Str("<")
    htmlrighttag = Str(">")
    htmlandtag = Str("&")
    enterCode = Str("\n")
    ruler = Str("----") + enterCode
    # NOTE(review): the "~cpp " suffix and the spaced-out "} } }" look like
    # escaping artifacts of the wiki page this listing was published on
    # (presumably "{{{" / "}}}" in the running code) -- confirm before reuse.
    rawTextStart=Str("{{{~cpp ")
    rawTextEnd=Str("} } }")

    # Scanner table.  Inside the 'rawtext' state only the end marker is
    # special; every other character is copied through verbatim.
    lexicon = Lexicon([
        (rawTextStart, repl_rawTextStart),
        State('rawtext', [
            (rawTextEnd, repl_rawTextEnd),
            (AnyChar, repl_normalString),
        ]),
        (italic, repl_italic),
        (bold, repl_bold),
        (boldanditalic, repl_boldAndItalic),
        # Escape HTML metacharacters found in page text.  The listing mapped
        # each of them to itself, which made these three rules no-ops.
        (htmllefttag, "&lt;"),
        (htmlrighttag, "&gt;"),
        (htmlandtag, "&amp;"),
        (url, repl_url),
        (interwiki, repl_interwiki),
        (pagelinkUsingCamelWord, repl_pagelink),
        (ruler, repl_ruler),
        (enterCode, repl_enter),
        (pagelinkUsingUnderbar, repl_pagelinkUsingUnderbar),
        (macro, repl_macro),
        (AnyChar | space, repl_normalString),])

    def __init__(self, aStream, anInterWikiMap=None, aScriptName='pyki.cgi', aMacroList=None):
        """Create a scanner over aStream.

        aStream        -- file-like object holding the wiki markup.
        anInterWikiMap -- dict of inter-wiki name -> base URL; a fresh empty
                          dict is used when omitted.
        aScriptName    -- prefix used when building page links.
        aMacroList     -- dict of macro name -> object with execute(); a
                          fresh empty dict is used when omitted.
        """
        Scanner.__init__(self, self.lexicon, aStream)
        # Avoid sharing one default dict between all instances.
        if anInterWikiMap is None:
            anInterWikiMap = {}
        if aMacroList is None:
            aMacroList = {}
        self.interWikiMap = anInterWikiMap
        self.scriptName = aScriptName
        self.macros = aMacroList
        # Instance flags; they shadow the class-level `bold` / `italic`
        # patterns, which the lexicon has already captured by value.
        self.bold = False
        self.italic = False

    def linkedLine(self):
        """Scan the whole stream and return the concatenated HTML."""
        writer = StringIO.StringIO("")
        while True:
            value = self.read()[0]
            # The loop ends on the first token whose value is None.
            if value is None:
                break
            writer.write(value)
        return writer.getvalue()
}}}
=== 개발중 이슈 ===
처음에는 Wiki 에서 Tag 에 대해 Tagger 클래스를 만들고, link 를 걸어주는 부분에 대해 AutoLinker 를, Macro 에는 MacroApplyer 를 각각 만들어주었다. 그러다가 문제가 생겼는데, 태그중에 그 영향력이 겹치는 부분이 생겨나게 된 것이다. 즉, 예를 든다면 Macro 의 경우 CamelWord 로 이름지어지기도 하는데, 이는 AutoLinker 의 apply 를 거치면서 anchor 태그가 붙어버리는 것이다.
해결방법 : 두가지인데, 하나는 AutoLinker 에서 Macro 관련 태그엔 무시하고 지나가는 방법이고 하나는 AutoLinker 와 MacroApplyer를 통합하는 방법이다.
전자의 경우 각각의 Class Responsibility 들을 유지한다는 장점이 있지만, AutoLinker 에서 원래 생각치 않았던 한가지 일을 더 해야 한다는 점이 있겠다.
후자의 경우 클래스가 커진다는 단점이 있지만, 의도한 lexical 들만 표현된다는 점과 1 pass 로 파싱이 같이 이루어질 수 있다는 장점이 있다.
결국은 후자를 선택하였다. 근데, 그러면서 이번엔 Tagger 와 AutoLinker 양쪽에 영향력을 미칠 거리가 생겼는데, 바로 텍스트를 그대로 보여주는 태그부분이다.
그러나......~ 후자로 수정하는데 40분도 안걸리다.; 작업으로 본다면 Parser 두개의 lexicon 을 합치는 작업임에도 불구하고, 그 안정성도 보장받으면서 parser 에서 line 단위 자르기 부분까지 수정하였다. 매 번 수정할 때마다 테스트를 돌리면서 확인하기 때문에 그 결과가 보장이 되었다. Text Processing 에서 이러한 부분에 대한 TDD의 필요는 정말 크다는 생각이 든다.
----
[Plex]








