Plex 로 Wiki Page Parser 를 만들던중. Plex 는 아주 훌륭한 readability 의 lexical analyzer code 를 만들도록 도와준다.
Plex Example : Wiki Parser ¶
현재 PyKi라는, 1002가 개인적으로 만들어서 사용중인 위키에서의 parser 클래스 중 일부 코드이다.
from Plex import *
import cStringIO
import StringIO


class _NoneMacro:
    """Fallback used when a macro name is not registered; renders nothing."""

    def execute(self):
        return ''


class Parser:
    """Entry point that renders a piece of wiki markup to HTML.

    Holds the configuration shared by every rendering call and delegates
    the actual scanning to ``WikiParser``.
    """

    def __init__(self, anInterWikiMap=None, aScriptName='', aMacros=None):
        """Store the rendering configuration.

        anInterWikiMap -- dict mapping an interwiki prefix to its base URL.
        aScriptName    -- prefix placed in front of every page link path.
        aMacros        -- dict mapping a macro name to an object that
                          provides ``execute()``.
        """
        # None stands for "empty" so that instances never share a single
        # default dict object.
        if anInterWikiMap is None:
            anInterWikiMap = {}
        if aMacros is None:
            aMacros = {}
        self.interWikiMap = anInterWikiMap
        self.scriptName = aScriptName
        self.macros = aMacros

    def makeToStream(self, aText):
        """Wrap ``aText`` in a file-like object for the scanner."""
        return cStringIO.StringIO(aText)

    def parse(self, aText):
        """Return the HTML produced by scanning ``aText``."""
        stream = self.makeToStream(aText)
        scanner = WikiParser(stream, self.interWikiMap,
                             self.scriptName, self.macros)
        return scanner.linkedLine()


class WikiParser(Scanner):
    """Plex scanner whose token actions emit HTML fragments.

    Each ``repl_*`` method is a lexicon action: it receives the matched
    text and returns the HTML that replaces it.
    """

    # ---- token actions ---------------------------------------------------

    def repl_bold(self, aText):
        """Flip the bold state and return the matching open/close tag."""
        self.bold = not self.bold
        if self.bold:
            return "<B>"
        return "</B>"

    def repl_italic(self, aText):
        """Flip the italic state and return the matching open/close tag."""
        self.italic = not self.italic
        if self.italic:
            return "<I>"
        return "</I>"

    def repl_boldAndItalic(self, aText):
        """Open or close a combined bold+italic span.

        NOTE(review): only ``self.italic`` is toggled here; ``self.bold``
        is left untouched, so mixing this marker with a separate bold
        marker can unbalance the tags. Kept as in the original.
        """
        self.italic = not self.italic
        if self.italic:
            return "<I><B>"
        return "</B></I>"

    def repl_ruler(self, aText):
        """Render a horizontal rule."""
        return "<HR>\n"

    def repl_enter(self, aText):
        """Render a line break."""
        return "<BR>\n"

    def getLinkStr(self, anUrl):
        """Return an anchor whose target and label are both ``anUrl``."""
        return "<a href='%(url)s'>%(url)s</a>" % {'url': anUrl}

    def getImageStr(self, anUrl):
        """Return an image tag pointing at ``anUrl``."""
        return "<img src='%(url)s'>" % {'url': anUrl}

    def isImage(self, aFileUrl):
        """Tell whether the text after the last '.' is a known image type.

        The comparison is case-sensitive. When there is no '.', the whole
        string is treated as the extension.
        """
        imgExtensions = ("jpg", "png", "gif", "jpeg")
        extension = aFileUrl[aFileUrl.rfind(".") + 1:]
        return extension in imgExtensions

    def repl_url(self, aText):
        """Render a bare URL as an inline image or as a link."""
        if self.isImage(aText):
            return self.getImageStr(aText)
        return self.getLinkStr(aText)

    def repl_interwiki(self, aText):
        """Render ``Prefix:PageName`` as an interwiki link.

        A known prefix links to its mapped base URL plus the page name;
        an unknown prefix links to the InterMap page and is flagged in the
        icon's ALT text.
        """
        # Split on the first ':' only: the page part may itself contain
        # ':' and a plain split(":") would then fail to unpack.
        wikiMapName, pageName = aText.split(":", 1)
        if wikiMapName in self.interWikiMap:
            url = self.interWikiMap[wikiMapName]
            imageTag = "<IMG SRC=../intertag.gif BORDER=0>"
            formatText = "<A HREF=%(url)s>%(imageTag)s</A><A HREF=%(url)s%(pageName)s>%(pageName)s</A>"
        else:
            url = self.scriptName + "/InterMap"
            imageTag = "<IMG SRC=../intertag.gif BORDER=1 ALT='Bad:%s'>" % (wikiMapName)
            formatText = "<A HREF=%(url)s>%(imageTag)s</A><A HREF=%(url)s>%(pageName)s</A>"
        return formatText % {'url': url, 'pageName': pageName,
                             'imageTag': imageTag}

    def repl_pagelink(self, aText):
        """Render a CamelCase word as a link to the page of that name."""
        values = {'scriptName': self.scriptName, 'pageName': aText}
        # NOTE(review): the non-existent-page case is hard-coded to one
        # literal name; presumably a placeholder for a real lookup.
        if aText == "NonExistPage":
            return "<a class=nonexist href='%(scriptName)s/%(pageName)s'>%(pageName)s</a>" % values
        return "<a href='%(scriptName)s/%(pageName)s'>%(pageName)s</a>" % values

    def repl_pagelinkUsingUnderbar(self, aText):
        """Render ``name_`` as a link to ``name`` (trailing '_' dropped)."""
        pageName = aText[:-1]
        return "<a href='%(scriptName)s/%(pageName)s'>%(pageName)s</a>" % {
            'scriptName': self.scriptName, 'pageName': pageName}

    def repl_macro(self, aText):
        """Run the macro named inside ``[[...]]`` and return its output.

        An unregistered (or falsy) macro entry renders as an empty string.
        """
        macroName = aText[2:-2]
        macro = self.macros.get(macroName) or _NoneMacro()
        return macro.execute()

    def repl_normalString(self, aText):
        """Pass the matched text through unchanged."""
        return aText

    def repl_rawTextStart(self, aText):
        """Enter the raw-text scanner state and open a PRE block."""
        self.begin("rawtext")
        return "<PRE>"

    def repl_rawTextEnd(self, aText):
        """Return to the default scanner state and close the PRE block."""
        self.begin("")
        return "</PRE>"

    # ---- patterns: automatic links ---------------------------------------

    urlHeader = Str("http://")
    # A link ends at a space or at the end of the line. Without "\n" in
    # the excluded set a URL at the end of a line would swallow the line
    # break and the first word of the next line.
    stringUntilSpace = Rep(AnyBut(" \n"))
    url = urlHeader + stringUntilSpace

    upperCase = Range('AZ')
    lowerCase = Range('az')
    upperCaseSequence = Rep1(upperCase)
    lowerCaseSequence = Rep1(lowerCase)
    alphabetSequence = Rep1(upperCaseSequence | lowerCaseSequence)

    interwikiDelim = Str(":")
    # "\n" is excluded here together with the url pattern above, so that
    # for text such as "http://host" both patterns still match the same
    # span and the url rule, listed first in the lexicon, keeps priority.
    stringUntilSpaceOrCommaOrRest = Rep1(AnyBut(" .,\n"))
    interwiki = alphabetSequence + interwikiDelim + stringUntilSpaceOrCommaOrRest

    pagelinkUsingCamelWord = (upperCase + Rep(lowerCase)
                              + Rep1(upperCaseSequence + lowerCaseSequence))
    underbarForLink = Str('_')
    pagelinkUsingUnderbar = Rep1(AnyBut('{\n_ ')) + underbarForLink
    macro = Str('[[') + Rep1(AnyBut(']]')) + Str(']]')
    space = Str(" ")

    # ---- patterns: tags --------------------------------------------------

    bold = Str("'''")
    italic = Str("''")
    boldanditalic = bold + italic
    htmllefttag = Str("<")
    htmlrighttag = Str(">")
    htmlandtag = Str("&")
    enterCode = Str("\n")
    ruler = Str("----") + enterCode
    # NOTE(review): the end marker is spelled with spaces, presumably so
    # the snippet would not close the wiki code fence it was published in;
    # confirm both markers against the real PyKi source.
    rawTextStart = Str("{{{~cpp ")
    rawTextEnd = Str("} } }")

    lexicon = Lexicon([
        (rawTextStart, repl_rawTextStart),
        # NOTE(review): inside the raw-text state every character is
        # passed through as-is, including '<', '>' and '&'.
        State('rawtext', [
            (rawTextEnd, repl_rawTextEnd),
            (AnyChar, repl_normalString),
        ]),
        (italic, repl_italic),
        (bold, repl_bold),
        (boldanditalic, repl_boldAndItalic),
        # HTML-significant characters are emitted as entities; emitting
        # the character itself would make these three rules no-ops.
        (htmllefttag, "&lt;"),
        (htmlrighttag, "&gt;"),
        (htmlandtag, "&amp;"),
        (url, repl_url),
        (interwiki, repl_interwiki),
        (pagelinkUsingCamelWord, repl_pagelink),
        (ruler, repl_ruler),
        (enterCode, repl_enter),
        (pagelinkUsingUnderbar, repl_pagelinkUsingUnderbar),
        (macro, repl_macro),
        (AnyChar | space, repl_normalString),
    ])

    def __init__(self, aStream, anInterWikiMap=None, aScriptName='pyki.cgi',
                 aMacroList=None):
        """Attach the scanner to ``aStream`` and reset the tag state.

        aStream        -- file-like object supplying the wiki text.
        anInterWikiMap -- dict mapping an interwiki prefix to its base URL.
        aScriptName    -- prefix placed in front of every page link path.
        aMacroList     -- dict mapping a macro name to an object that
                          provides ``execute()``.
        """
        Scanner.__init__(self, self.lexicon, aStream)
        # None stands for "empty" so that instances never share a single
        # default dict object.
        if anInterWikiMap is None:
            anInterWikiMap = {}
        if aMacroList is None:
            aMacroList = {}
        self.interWikiMap = anInterWikiMap
        self.scriptName = aScriptName
        self.macros = aMacroList
        # These instance flags shadow the class-level ``bold``/``italic``
        # patterns, which are only needed while the lexicon is built.
        self.bold = False
        self.italic = False

    def linkedLine(self):
        """Scan the whole stream and return the concatenated HTML."""
        writer = StringIO.StringIO("")
        while True:
            token = self.read()
            # read() yields a None value once the input is exhausted.
            if token[0] is None:
                break
            writer.write(token[0])
        return writer.getvalue()


# === Development notes ===
# (English translation of the Korean notes that follow the code on the
# original wiki page.)
#
# At first there were separate classes: Tagger for wiki tags, AutoLinker
# for the parts that create links, and MacroApplyer for macros. Then a
# problem appeared: the areas affected by different tags overlapped. For
# example, a macro may be named with a CamelWord, and when it went through
# AutoLinker's apply step an anchor tag was attached to it.
#
# There were two ways out: make AutoLinker skip macro-related tags, or
# merge AutoLinker and MacroApplyer. The former keeps each class's
# responsibility intact, but AutoLinker has to take on a job it was never
# meant to do. The latter makes the class bigger, but only the intended
# lexemes are expressed and parsing happens together in a single pass. In
# the end the latter was chosen.
#
# After that, something came up that affected both Tagger and AutoLinker:
# the tag that shows text verbatim. Even so, reworking the code along the
# second approach took less than 40 minutes. In terms of work it meant
# merging the lexicons of two parsers, yet stability was assured and even
# the line-splitting part of the parser was changed. The tests were run
# after every modification, so the result was guaranteed. For text
# processing of this kind the power of TDD really is great.
#
# ---- [Plex]