E D R , A S I H C RSS

Building Wiki Parser Using Plex

Plex 로 Wiki Page Parser 를 만들던중. Plex 는 아주 훌륭한 readability 의 lexical analyzer code 를 만들도록 도와준다.

Plex Example : Wiki Parser

현재 PyKi라는, 1002가 개인적으로 만들어서 사용중인 위키에서의 parser 클래스 중 일부 코드이다.

{{{~cpp 
from Plex import *
import cStringIO, StringIO

class Parser:
    """Convenience front-end that runs a WikiParser over a piece of text.

    Holds the inter-wiki map, the script name and the macro table, and
    hands them to a fresh WikiParser for every call to parse().
    """

    def __init__(self, anInterWikiMap=None, aScriptName='', aMacros=None):
        """Store the parser configuration.

        anInterWikiMap -- mapping handed to WikiParser as its inter-wiki map
                          (defaults to a new empty dict)
        aScriptName    -- script name handed to WikiParser
        aMacros        -- mapping handed to WikiParser as its macro table
                          (defaults to a new empty dict)
        """
        # None sentinels instead of ``{}`` defaults: a literal dict default
        # is created once and would be shared (and mutated) by every Parser
        # built without explicit arguments.
        if anInterWikiMap is None:
            anInterWikiMap = {}
        if aMacros is None:
            aMacros = {}
        self.interWikiMap = anInterWikiMap
        self.scriptName = aScriptName
        self.macros = aMacros

    def makeToStream(self, aText):
        """Wrap aText in a file-like object for the scanner to read from."""
        return cStringIO.StringIO(aText)

    def parse(self, aText):
        """Return the result of WikiParser.linkedLine() for aText."""
        stream = self.makeToStream(aText)
        return WikiParser(stream, self.interWikiMap,self.scriptName, self.macros).linkedLine()

class WikiParser(Scanner):
    """Plex scanner that rewrites wiki markup as HTML in one pass.

    The class body defines, in order: the ``repl_*`` callbacks that produce
    the HTML for a matched piece of text, the Plex patterns, and the
    ``lexicon`` that binds patterns to callbacks or replacement strings.
    linkedLine() drives the scanner and concatenates the produced values.
    """

    def repl_bold(self, aText):
        """Toggle the bold state and return the opening or closing tag."""
        self.bold = not self.bold
        return ("<B>","</B>")[not self.bold]

    def repl_italic(self, aText):
        """Toggle the italic state and return the opening or closing tag."""
        self.italic = not self.italic
        return ("<I>","</I>")[not self.italic]

    def repl_boldAndItalic(self, aText):
        """Toggle the combined bold+italic markup, tracked via self.italic."""
        # NOTE(review): only self.italic is toggled here; self.bold is left
        # untouched, so mixing this marker with a lone bold marker may
        # produce unbalanced tags -- confirm whether that is intended.
        self.italic = not self.italic
        return ("<I><B>","</B></I>")[not self.italic]

    def repl_ruler(self, aText):
        """Return a horizontal rule."""
        return "<HR>\n"

    def repl_enter(self, aText):
        """Return a line break."""
        return "<BR>\n"

    def getLinkStr(self, anUrl):
        """Return an anchor whose target and label are both anUrl."""
        return "<a href='%(url)s'>%(url)s</a>" % {'url':anUrl}

    def getImageStr(self, anUrl):
        """Return an image tag whose source is anUrl."""
        return "<img src='%(url)s'>" % {'url':anUrl}

    def isImage(self, aFileUrl):
        """Return True when aFileUrl ends in a known image extension."""
        imgExtensions = ("jpg", "png", "gif", "jpeg")
        # Compare case-insensitively so that e.g. "photo.JPG" is an image too.
        extension = aFileUrl[aFileUrl.rfind(".")+1:].lower()
        return extension in imgExtensions

    def repl_url(self, aText):
        """Render a URL as an inline image or as a plain link."""
        if self.isImage(aText):
            return self.getImageStr(aText)
        return self.getLinkStr(aText)

    def repl_interwiki(self, aText):
        """Render ``WikiName:PageName`` as an inter-wiki link.

        Known wiki names link to the mapped URL plus the page name; unknown
        ones link to the InterMap page and get a 'Bad:' marker image.
        """
        # Split on the first colon only: the page part is matched by
        # AnyBut(" .,") and may itself contain colons, in which case an
        # unbounded split would not unpack into exactly two names.
        wikiMapName, pageName = aText.split(":", 1)
        if wikiMapName in self.interWikiMap:
            url = self.interWikiMap[wikiMapName]
            imageTag = "<IMG SRC=../intertag.gif BORDER=0>"
            formatText = "<A HREF=%(url)s>%(imageTag)s</A><A HREF=%(url)s%(pageName)s>%(pageName)s</A>"
        else:
            url = self.scriptName+"/InterMap"
            imageTag = "<IMG SRC=../intertag.gif BORDER=1 ALT='Bad:%s'>"%(wikiMapName)
            formatText="<A HREF=%(url)s>%(imageTag)s</A><A HREF=%(url)s>%(pageName)s</A>"
        return  formatText % {'url':url, 'pageName':pageName, 'imageTag':imageTag}

    def repl_pagelink(self, aText):
        """Render a CamelWord as a link to the page of the same name."""
        # The literal name "NonExistPage" is the only one rendered with the
        # 'nonexist' class; no page-existence lookup happens here.
        if aText == "NonExistPage":
            return "<a class=nonexist href='%(scriptName)s/%(pageName)s'>%(pageName)s</a>" % {'scriptName':self.scriptName, 'pageName':aText}
        return "<a href='%(scriptName)s/%(pageName)s'>%(pageName)s</a>" \
                % {'scriptName':self.scriptName, 'pageName':aText}

    def repl_pagelinkUsingUnderbar(self, aText):
        """Render ``name_`` as a page link, dropping the trailing underbar."""
        pageName = aText[:-1]
        return "<a href='%(scriptName)s/%(pageName)s'>%(pageName)s</a>" % {'scriptName':self.scriptName, 'pageName':pageName}

    def repl_macro(self, aText):
        """Run the macro named inside ``[[...]]`` and return its output.

        Unknown macro names produce an empty string.
        """
        class NoneMacro:
            def execute(self):
                return ''

        # Strip the surrounding "[[" and "]]".
        macroName = aText[2:-2]
        macro=self.macros.get(macroName) or NoneMacro()
        return macro.execute()

    def repl_normalString(self, aText):
        """Pass the matched text through unchanged."""
        return aText

    # for auto link
    urlHeader = Str("http://")
    # NOTE(review): only a space ends a URL, so a newline directly after a
    # URL is matched as part of it -- confirm whether that is intended.
    stringUntilSpace = Rep(AnyBut(" "))
    url = urlHeader + stringUntilSpace

    upperCase = Range('AZ')
    lowerCase = Range('az')
    upperCaseSequence = Rep1(upperCase)
    lowerCaseSequence = Rep1(lowerCase)
    alphabetSequence = Rep1(upperCaseSequence | lowerCaseSequence)

    interwikiDelim = Str(":")
    stringUntilSpaceOrCommaOrRest = Rep1(AnyBut(" .,"))
    interwiki = alphabetSequence + interwikiDelim + stringUntilSpaceOrCommaOrRest

    pagelinkUsingCamelWord = upperCase + Rep(lowerCase) + Rep1(upperCaseSequence + lowerCaseSequence)

    underbarForLink = Str('_')
    pagelinkUsingUnderbar = Rep1(AnyBut('{\n_ ')) + underbarForLink

    macro = Str('[[') + Rep1(AnyBut(']]')) + Str(']]')

    space = Str(" ")


    # for tag
    bold = Str("'''")
    italic = Str("''")
    boldanditalic = bold + italic
    htmllefttag = Str("<")
    htmlrighttag = Str(">")
    htmlandtag = Str("&")
    enterCode = Str("\n")
    ruler = Str("----") + enterCode
    # NOTE(review): these two delimiters look like artifacts of publishing
    # the code inside a wiki code block (a literal closing marker would have
    # ended that block) -- confirm the markers the parser should recognise.
    rawTextStart=Str("{{{~cpp ")
    rawTextEnd=Str("} } }")

    def repl_rawTextStart(self, aText):
        """Switch the scanner to the 'rawtext' state and open a <PRE>."""
        self.begin("rawtext")
        return "<PRE>"

    def repl_rawTextEnd(self, aText):
        """Switch the scanner back to the default state and close the <PRE>."""
        self.begin("")
        return "</PRE>"

    lexicon = Lexicon([
                (rawTextStart, repl_rawTextStart),
                State('rawtext', [
                    (rawTextEnd, repl_rawTextEnd),
                    # Raw text is shown verbatim, so markup characters must
                    # be escaped here as well; listed before AnyChar so they
                    # win the tie on a one-character match.
                    (htmllefttag, "&lt;"),
                    (htmlrighttag, "&gt;"),
                    (htmlandtag, "&amp;"),
                    (AnyChar, repl_normalString),
                ]),
                (italic, repl_italic),
                (bold, repl_bold),
                (boldanditalic, repl_boldAndItalic),
                # Escape HTML-significant characters from the wiki source so
                # they are displayed rather than interpreted by the browser.
                (htmllefttag, "&lt;"),
                (htmlrighttag, "&gt;"),
                (htmlandtag, "&amp;"),

                (url, repl_url),
                (interwiki, repl_interwiki),
                (pagelinkUsingCamelWord, repl_pagelink),
                (ruler, repl_ruler),
                (enterCode, repl_enter),
                (pagelinkUsingUnderbar, repl_pagelinkUsingUnderbar),

                (macro, repl_macro),
                (AnyChar | space, repl_normalString),])

    def __init__(self, aStream, anInterWikiMap=None, aScriptName='pyki.cgi', aMacroList=None):
        """Set up the scanner over aStream.

        aStream        -- file-like object containing the wiki text
        anInterWikiMap -- mapping of wiki name to base URL, used by
                          repl_interwiki (defaults to a new empty dict)
        aScriptName    -- prefix used when building page links
        aMacroList     -- mapping of macro name to an object with an
                          execute() method (defaults to a new empty dict)
        """
        Scanner.__init__(self, self.lexicon, aStream)
        # None sentinels instead of ``{}`` defaults, which would be one dict
        # shared by every instance created without explicit arguments.
        if anInterWikiMap is None:
            anInterWikiMap = {}
        if aMacroList is None:
            aMacroList = {}
        self.interWikiMap=anInterWikiMap
        self.scriptName = aScriptName
        self.macros = aMacroList

        # Per-instance toggle state; these shadow the class-level patterns
        # of the same name, which the lexicon has already captured.
        self.bold = False
        self.italic = False

    def linkedLine(self):
        """Scan the whole stream and return the concatenated output."""
        writer = StringIO.StringIO("")
        while True:
            token = self.read()
            # A None token value ends the loop.
            if token[0] is None:
                break
            writer.write(token[0])
        return writer.getvalue()

}}}

=== 개발중 이슈 ===
μ²˜μŒμ—λŠ” Wiki μ—μ„œ Tag 에 λŒ€ν•΄ Tagger ν΄λž˜μŠ€λΌ λ§Œλ“€κ³ , link λΌ κ±Έμ–΄μ£ΌλŠ” 뢀뢄에 λŒ€ν•΄ AutoLinker λΌ, Macro μ—λŠ” MacroApplyer λΌ κ°κ° λ§Œλ“€μ–΄μ£Όμ—ˆλ‹€. κ·ΈλŸ¬λ‹€κ°€ λ¬Έμ œκ°€ μƒκ²ΌλŠ”λ°, νƒœκ·Έμ€‘μ— κ·Έ 영ν–₯λ ₯이 κ²ΉμΉ˜λŠ” 뢀뢄이 μƒκ²¨λ‚˜κ²Œ 된 것이닀. 즉, μ˜ˆλΌ λ“ λ‹€λ©΄ Macro 의 경우 CamelWord 둜 이름지어지기도 ν•˜λŠ”λ°, μ΄λŠ” AutoLinker 의 apply λΌ κ±°μΉ˜λ©΄μ„œ archor νƒœκ·Έκ°€ λΆ™μ–΄λ²„λ¦¬λŠ” 것이닀.

해결방법 : 두가지인데, 하나는 AutoLinker 에서 Macro 관련 태그시 무시하고 지나가는 방법이고 하나는 AutoLinker 와 MacroApplyer를 통합하는 방법이다.
전자의 경우 각각의 Class Responsibility 들을 유지한다는 장점이 있지만, AutoLinker 에서 원래 생각치 않았던 한가지 일을 더 해야 한다는 점이 있겠다.
후자의 경우 클래스가 커진다는 단점이 있지만, 의도한 lexical 들만 표현된다는 점과 1 pass 로 파싱이 같이 이루어질 수 있다는 장점이 있다.

결국은 후자를 선택하였다. 근데, 그러면서 이번엔 Tagger 와 AutoLinker 양쪽에 영향력을 미칠 거리가 생겼는데, 바로 텍스트를 그대로 보여주는 태그부분이다.

그러나......~ 후자로 수정하는데 40분도 안걸리다.; 작업으로 본다면 Parser 두개의 lexicon 을 합치는 작업임에도 불구하고, 그 안정성도 보장받으면서 parser 에서 line 단위 자르기 부분까지 수정하였다. 매 번 수정할때마다 테스트를 돌리면서 확인했기 때문에 그 결과가 보장이 되었다. Text Processing 에서 이러한 부분에 대한 TDD의 파워는 정말 크다란 생각이 든다.

----
[Plex]
Valid XHTML 1.0! Valid CSS! powered by MoniWiki
last modified 2021-02-07 05:22:39
Processing time 0.0114 sec