# fellowiki/controllers/wikiparser/parser.py

# pylint: disable-msg=W0613,E0201
# W0613, E0201: allow argument magic (i.e. attributes_from_dict())

# Copyright (c) 2006 Jan Niklas Fingerle
# 
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.

"""fellowiki wiki parser

TODO: overview wiki parser
    
"""

from fellowiki.util.assorted import attributes_from_dict
import elementtree.ElementTree as ElementTree
from fellowiki.util.xmlelement import XMLElement
from util import remove_backslashes_and_whitespace, remove_escaping_backslashes
      
import sre
import re
from copy import deepcopy


# Key in the parser ``state`` dict: incremented when a prefix token is
# evaluated, decremented again when it is rendered or closed by a line break.
PREFIX_COUNT = "prefix count"


# Key under which the callable of a ``procs`` entry is looked up
# (see WikiParser.evaluate()).
PROC = 'procedure'

# Capability names.  Tokens answer ``is_a()`` queries with them; LINEBREAK,
# WHITESPACE and TEXT additionally serve as keys of WikiParser.regexes.
WHITESPACE = 'white space'
TEXT = 'text'
LINEBREAK = 'line break'
STRUCTUREMOD = 'structure modifier'
PARAGRAPH = 'paragraph'
END_TOKEN = 'end token'
ENCAPSULATE_TOKEN = 'encapsulate token'
PREFIX = 'prefix'

class WikiError(StandardError):
    """Common base class of the exceptions defined in this module."""
    pass

class WikiParserError(WikiError):
    """Raised by WikiParser.parse() when the text cannot be parsed."""
    pass

class WikiTokenError(WikiError):
    """Token related error.

    NOTE(review): not raised anywhere in this module; presumably meant for
    token classes defined by extensions -- confirm.
    """
    pass
    
class Token(object):
    """Base class of all tokens handled by the wiki parser.

    A token keeps the matched source text in ``token`` and a possibly
    trimmed and unescaped variant of it in ``text``.  ``evaluate()`` puts the
    token on the result stack, folding entries that are already there
    according to ``preference``.  Subclasses specialise the behaviour by
    overriding ``render()``, ``match_is_open()``, ``close_matching()`` and
    ``is_a()``; a subclass whose ``close_matching()`` can return true must
    also provide ``close(match, new_token)``.
    """

    def __init__(self, token, cut_right = None, cut_left = None,
                     decode_backslash = False, preference = None, **attr_dict):
        """Create a token from the matched source text.

        token            -- the text matched by the scanner
        cut_right        -- number of characters to drop from the right end
                            of ``token`` when computing ``text`` (None or 0:
                            drop nothing)
        cut_left         -- start index of ``text`` within ``token``
        decode_backslash -- if true, ``text`` is passed through
                            remove_escaping_backslashes()
        preference       -- priority used by evaluate(); None means "none"
        attr_dict        -- additional values handed to
                            attributes_from_dict() together with the
                            arguments above
        """
        # Snapshot the arguments before any other local name exists, so that
        # exactly the constructor arguments (plus attr_dict's entries, which
        # win on a name clash) are handed over.  attributes_from_dict()
        # presumably stores each entry as an attribute of ``self`` -- the
        # methods below rely on self.token and self.preference being set.
        attributes = dict(locals())
        attributes.update(attr_dict)
        attributes_from_dict(attributes)

        try:
            cut_right = -cut_right
        except TypeError:
            # e.g. None: keep the value as it is
            pass
        if not cut_right:
            # A slice end of -0 == 0 would yield an empty string instead of
            # "cut nothing", so fall back to an open-ended slice.
            cut_right = None
        self.text = token[cut_left:cut_right]

        if decode_backslash:
            self.text = remove_escaping_backslashes(self.text)

    def render(self, new_token):
        """Put the unmodified source text in front of new_token's content."""
        new_token.prepend(self.token)

    def match_is_open(self):
        """Tell whether a match for this token is still pending (never)."""
        return False

    def close_matching(self, match):
        """Tell whether this token closes ``match`` (never)."""
        return False

    def evaluate(self, result, tokens, state, procs):
        """Fold the result stack as far as necessary and push this token.

        result -- list used as a stack of already evaluated tokens
        tokens -- list of the tokens still to be evaluated
        state  -- dict with parser state shared by all tokens
        procs  -- not used by this implementation

        ``result``, ``tokens`` and ``state`` are remembered on the instance
        for later use by the other methods.
        """
        self.result = result
        self.tokens = tokens
        self.state = state

        # collects the rendered content of everything folded below
        new_token = ResultToken()

        # fold result set where appropriate (e.g. higher priority) and
        # close matching "braces"
        #
        # The loop continues while the stack is non-empty and either
        #  * this token has a preference and
        #      - the top entry has a higher preference, or
        #      - this token's preference is negative, or
        #      - the top entry has no preference while the entry below it
        #        has none either or has a higher one, or
        #      - match_is_open() reports a pending match; or
        #  * neither this token nor the top entry has a preference.
        while result \
            and (self.preference is not None
                    and (result[-1].preference > self.preference
                        or self.preference < 0
                        or len(result) > 1 and result[-1].preference is None
                            and (result[-2].preference is None
                                or result[-2].preference > self.preference)
                        or self.match_is_open())
                or self.preference is None
                    and result[-1].preference is None):
            res = result.pop()
            if self.close_matching(res):
                # matching braces generate a new token that will be evaluated
                # as a new token
                self.close(res, new_token)
                return
            else:
                res.render(new_token)

        if new_token.xhtml.is_not_empty():
            result.append(new_token)

        # the end token only flushes the stack, it is never stored itself
        if not self.is_a(END_TOKEN):
            result.append(self)

    def is_a(self, *capabilities):
        """Tell whether the token has one of ``capabilities`` (never)."""
        return False
            

class ResultToken(Token):
    """Token that collects already rendered output.

    The output lives in an ``XMLElement('div')`` stored in ``xhtml``; the
    ``preference`` of a result token is always None.
    """

    def __init__(self):
        """Create an empty result; Token.__init__() is not called here."""
        attributes_from_dict(locals())
        self.preference = None
        self.xhtml = XMLElement('div')

    def render(self, new_token):
        """Put this token's collected content in front of new_token's."""
        collected = self.xhtml
        new_token.prepend_element_contents(collected)

    def prepend(self, *prependee):
        """Insert the given items at the front of the collected content."""
        container = self.xhtml
        container.prepend(*prependee)

    def append(self, *appendee):
        """Add the given items to the end of the collected content."""
        container = self.xhtml
        container.append(*appendee)

    def prepend_element_contents(self, element):
        """Insert ``element.content`` at the front of the collected content."""
        container = self.xhtml
        container.prepend(*element.content)

    def append_element_contents(self, element):
        """Add ``element.content`` to the end of the collected content."""
        container = self.xhtml
        container.append(*element.content)

class EndToken(Token):
    """Marker token appended after the last token of the input.

    Its negative preference and permanently "open" match make
    Token.evaluate() fold the result stack as far as possible.
    """

    def __init__(self):
        """Create the end marker; Token.__init__() is not called here."""
        attributes_from_dict(locals())
        self.preference = -1

    def render(self, new_token):
        """Contribute nothing to the output."""
        return None

    def match_is_open(self):
        """Always report a pending match."""
        return True

    def is_a(self, *capabilities):
        """Tell whether END_TOKEN is among ``capabilities``."""
        is_end_token = END_TOKEN in capabilities
        return is_end_token

class TextToken(Token):
    """Token for plain text; runs of blanks are collapsed when rendering."""

    def __init__(self, token, new_text=None, **attr_dict):
        """Create a text token.

        new_text -- if not None, rendered instead of the token's own text
        """
        Token.__init__(self, token, new_text=new_text, **attr_dict)

    def render(self, new_token):
        """Prepend the text, with each whitespace run replaced by one blank."""
        source = self.new_text
        if source is None:
            source = self.text
        new_token.prepend(re.sub('[ \n\t]+', ' ', source))

class WhitespaceToken(TextToken):
    """Text token that identifies itself as white space."""

    def is_a(self, *capabilities):
        """Tell whether WHITESPACE is among ``capabilities``."""
        is_whitespace = WHITESPACE in capabilities
        return is_whitespace

class PrefixToken(Token):
    """Token that acts as a prefix of the content following it.

    The number of prefix tokens that were evaluated but neither rendered nor
    closed yet is tracked in ``state[PREFIX_COUNT]``.  ``do_prefix()`` is
    not defined in this class; presumably subclasses have to provide it.
    """

    def __init__(self, token, *args, **kwargs):
        """Create a prefix token that has not been applied yet."""
        self.has_been_prefixed = False
        Token.__init__(self, token, *args, **kwargs)

    def evaluate(self, result, tokens, state, procs):
        """Count this prefix in ``state``, then evaluate like any token."""
        pending = state.get(PREFIX_COUNT, 0)
        state[PREFIX_COUNT] = pending + 1
        Token.evaluate(self, result, tokens, state, procs)

    def render(self, new_token):
        """Remove this prefix from the count and render the source text."""
        pending = self.state.get(PREFIX_COUNT, 0)
        self.state[PREFIX_COUNT] = pending - 1
        Token.render(self, new_token)

    def prefix(self, new_token):
        """Apply the prefix to the content collected in ``new_token``.

        If nothing was collected, this token and ``new_token`` are put back
        at the front of the pending tokens (this token first); otherwise the
        work is handed to ``do_prefix()``.
        """
        self.has_been_prefixed = True
        if not new_token.xhtml.is_empty():
            self.do_prefix(new_token)
            return
        self.tokens.insert(0, new_token)
        self.tokens.insert(0, self)

    def is_a(self, *capabilities):
        """Tell whether PREFIX is among ``capabilities``."""
        is_prefix = PREFIX in capabilities
        return is_prefix
          
class LineBreakToken(Token):
    """Token for a single line break.

    Two consecutive line breaks are merged into a ParagraphToken; a line
    break also closes prefix tokens that have not been applied yet.
    """

    def render(self, new_token):
        """Render the line break as a single blank."""
        new_token.prepend(' ')

    def evaluate(self, result, tokens, state, procs):
        """Merge with a directly following line break or evaluate normally."""
        starts_paragraph = (self.is_a(LINEBREAK)
                            and tokens[0].is_a(LINEBREAK))
        if not starts_paragraph:
            Token.evaluate(self, result, tokens, state, procs)
            return
        # both line breaks are replaced by one paragraph token that is
        # evaluated next
        follower = tokens.pop(0)
        tokens.insert(0, ParagraphToken(self.token + follower.token))

    def match_is_open(self):
        """Tell whether any prefix token is still counted in the state."""
        pending = self.state.get(PREFIX_COUNT, 0)
        return pending > 0

    def close_matching(self, match):
        """Tell whether ``match`` is a prefix token not applied so far."""
        return match.is_a(PREFIX) and not match.has_been_prefixed

    def close(self, match, new_token):
        """Apply the prefix token ``match`` to the collected content.

        This token is put back at the front of the pending tokens when
        further prefixes are still counted or when it is not a plain line
        break.
        """
        remaining = self.state.get(PREFIX_COUNT, 0) - 1
        self.state[PREFIX_COUNT] = remaining
        if remaining > 0 or not self.is_a(LINEBREAK):
            self.tokens.insert(0, self)
        match.prefix(new_token)

    def is_a(self, *capabilities):
        """Tell whether LINEBREAK is among ``capabilities``."""
        is_line_break = LINEBREAK in capabilities
        return is_line_break

           
class ParagraphToken(LineBreakToken):
    """Token that separates paragraphs.

    A paragraph token collects finished paragraphs in its own ``xhtml``
    element and closes against the previous paragraph token on the result
    stack; the content found between the two becomes a ``<p>`` element.
    ``modifiers`` holds the values gathered from tokens that offer a
    ``modify()`` method.
    """

    def __init__(self, token, *args, **kwargs):
        """Create a paragraph token with preference 0 and empty content."""
        LineBreakToken.__init__(self, token, *args, **kwargs)
        self.preference = 0
        self.xhtml = XMLElement('div')
        self.modifiers = {}
        
    def match_is_open(self):
        """Always report a pending match."""
        return True
    
    def close_matching(self, match):
        """Tell whether ``match`` is a paragraph token or an unapplied prefix."""
        return match.is_a(PARAGRAPH) \
            or LineBreakToken.close_matching(self, match) 
           
    def evaluate(self, result, tokens, state, procs): 
        """Absorb neighbouring separators, then evaluate like any token.

        Line breaks on top of the result stack and white space, structure
        modifiers, line breaks and paragraph tokens at the front of the
        pending tokens are consumed by this token.
        """
        # swallow line breaks that were evaluated already, keeping their text
        while result and result[-1].is_a(LINEBREAK):
            previous_result = result.pop()
            self.token = previous_result.token + self.token
        
        caught_line_break = False
        
        while tokens[0].is_a(WHITESPACE, STRUCTUREMOD, LINEBREAK, PARAGRAPH):
            next_token = tokens.pop(0)
            # a further paragraph break (a paragraph token or two line
            # breaks in a row) discards the modifiers gathered so far
            if next_token.is_a(PARAGRAPH) or \
                caught_line_break and next_token.is_a(LINEBREAK):
                self.modifiers = {}
            caught_line_break = next_token.is_a(LINEBREAK)
            # tokens without a modify() method are skipped silently
            # NOTE(review): an AttributeError raised inside modify() is
            # swallowed here as well.
            try:
                next_token.modify(self.modifiers)
            except AttributeError:
                pass
            
        # is_a(LINEBREAK) is false for this class, so this ends up in
        # Token.evaluate()
        LineBreakToken.evaluate(self, result, tokens, state, procs)
        
    def close(self, match, new_token):
        """Close against ``match`` using the content in ``new_token``.

        An unapplied prefix token is handled by LineBreakToken.close().
        Otherwise non-empty content is turned into a ``<p>`` element and
        appended to ``match.xhtml``; ``match`` is then put back at the front
        of the pending tokens and do_extended_close() is called.
        """
        if LineBreakToken.close_matching(self, match):
            LineBreakToken.close(self, match, new_token)
        else:
            if new_token.xhtml.is_not_empty():
                new_token.xhtml.tag = 'p'
                
                # only the 'align' modifier is evaluated here
                for (key, value) in match.modifiers.items():
                    if key == 'align':
                        new_token.xhtml.attributes['class'] = 'align-%s' % value
                # the modifiers gathered by this token are handed on to match
                match.modifiers = self.modifiers   ### shouldn't we reconsider and insert self?
                
                match.xhtml.append(new_token.xhtml)
            self.tokens.insert(0, match)
            self.do_extended_close(match)
            
    def do_extended_close(self, inserted_token):
        """Hook called at the end of close(); does nothing in this class."""
        pass
        
    def render(self, new_token):
        """Put the collected paragraphs in front of new_token's content."""
        new_token.prepend_element_contents(self.xhtml)
            
    def is_a(self, *capabilities):
        """Tell whether PARAGRAPH is among ``capabilities``."""
        return PARAGRAPH in capabilities
 
class ParagraphSeparatorToken(ParagraphToken):
    """Paragraph token that answers is_a() like the plain Token base class."""

    def is_a(self, *capabilities):
        """Delegate to Token.is_a(), bypassing ParagraphToken's answer."""
        return Token.is_a(self, *capabilities)
 
class EncapsulateToken(Token):
    """Token that encloses content between an opening and a closing marker.

    The code reads two attributes that are not set in this class:
    ``self.type`` (compared against '(', ')' and '_') and ``self.STATE``
    (the key of this token kind's counter in the parser state).
    NOTE(review): presumably ``type`` arrives through the keyword arguments
    of the constructor and ``STATE`` is defined by subclasses -- confirm.
    """

    def __init__(self, token, *args, **kwargs):
        """Create the token; no neighbouring white space is consumed yet."""
        Token.__init__(self, token, *args, **kwargs)
        self.consumed_whitespace_left = False
        self.consumed_whitespace_right = False

    def render(self, new_token):
        """Render the source text literally.

        A token of type '(' that gets rendered also takes back the count it
        added to the state in evaluate().
        """
        if self.type == '(':
            self.state[self.STATE] = self.state.get(self.STATE, 0) - 1
        Token.render(self, new_token)

    def evaluate(self, result, tokens, state, procs):
        """Absorb adjacent white space, evaluate, and count openers.

        Unless the token is of type '(', a white space entry on top of the
        result stack is removed and replaced by one blank in front of the
        source text.  Unless it is of type ')', white space tokens that
        follow are removed and replaced by one trailing blank.
        """
        ## there can be only one WhitespaceToken in a row
        if self.type != '(' and result and result[-1].is_a(WHITESPACE):
            result.pop()
            if not self.consumed_whitespace_left:
                self.consumed_whitespace_left = True
                self.token = ' ' +  self.token
        while self.type != ')' and tokens and tokens[0].is_a(WHITESPACE):
            tokens.pop(0)
            if not self.consumed_whitespace_right:
                self.consumed_whitespace_right = True
                self.token = self.token + ' '

        Token.evaluate(self, result, tokens, state, procs)
        # counted only after evaluation, so the token does not see itself
        if self.type == '(':
            state[self.STATE] = state.get(self.STATE, 0) + 1

    def match_is_open(self):
        """Tell whether a ')' token has an opener counted in the state."""
        if self.type == ')':
            return self.state.get(self.STATE, 0) > 0
        return False

    def close_matching(self, match):
        """Tell whether this ')' token closes ``match``.

        This requires exactly one counted opener, and ``match`` has to be an
        encapsulate token of type '(' or '_' with the same STATE and text.
        """
        return (self.state.get(self.STATE, 0) == 1 and
                self.type == ')' and match.is_a(ENCAPSULATE_TOKEN) and
                self.STATE == match.STATE and self.text == match.text and
                match.type in ('(', '_'))

    def close(self, match, new_token):
        """Close ``match``, uncount a '(' opener and hand over the result."""
        if match.type == '(':
            self.state[self.STATE] = self.state.get(self.STATE, 0) - 1
        self.insert_result(match, new_token)

    def insert_result(self, match, result_token):
        """Put ``result_token`` at the front of the pending tokens."""
        self.tokens.insert(0, result_token)

    def is_a(self, *capabilities):
        """Tell whether ENCAPSULATE_TOKEN is among ``capabilities``."""
        return ENCAPSULATE_TOKEN in capabilities
   

class BetweenParagraphsXHTML(ParagraphSeparatorToken):
    """Paragraph separator that carries a ready-made XHTML element."""

    def __init__(self, token, xhtml):
        """Create the separator; ``xhtml`` replaces the default element."""
        ParagraphToken.__init__(self, token)
        self.xhtml = xhtml

    def do_extended_close(self, inserted_token):
        """Append the carried element to the content of ``inserted_token``."""
        target = inserted_token.xhtml
        target.append(self.xhtml)
        
def _token_factory(token_cls, kw_args):
    def new_token(_, token):
        return token_cls(token, **kw_args)
    return new_token    
        
class WikiParser(object):
    """Turn wiki text into an element tree plus a list of "translations".

    ``regexes`` maps a name to a tuple ``(order, pattern, token class,
    keyword arguments)``.  The tuples are sorted before the scanner is
    built, so a lower ``order`` puts a pattern earlier in the scanner's
    lexicon.  Every instance works on its own deep copy of the class-level
    table, which extensions may change through ``extend_wiki_parser()``.
    """

    regexes = {LINEBREAK: (40, r'[ \t]*\n[ \t]*', LineBreakToken, dict(preference = 20)),
               WHITESPACE: (90, r'[ \t]+', WhitespaceToken, dict()),
               TEXT: (99, r'.[a-zA-Z0-9]*', TextToken, dict())}

    def __init__(self, procs, extensions):
        """Set up the scanner.

        procs      -- mapping handed to every token's evaluate(); its
                      entries are also used by evaluate() below
        extensions -- iterable of objects whose ``extend_wiki_parser(self)``
                      is called before the scanner is built
        """
        self.regexes = deepcopy(self.regexes) # copy per instance
        for extension in extensions:
            extension.extend_wiki_parser(self)
        lexicon = [(rex, _token_factory(class_, kw_args)) for
                        (_, rex, class_, kw_args) in sorted(self.regexes.values())]
        # The deprecated ``sre`` shim of Python >= 2.5 only re-exports the
        # public names of ``re`` and therefore lacks Scanner; fall back to
        # ``sre`` only where ``re`` does not offer the class (Python 2.4).
        scanner_cls = getattr(re, 'Scanner', None)
        if scanner_cls is None:
            scanner_cls = sre.Scanner
        self.scanner = scanner_cls(lexicon, re.M)
        self.procs = procs

    def parse(self, text):
        """Parse ``text`` and return ``(xhtml_tree, translations)``.

        The result is taken from ``to_element_tree()`` of the single
        remaining result token; text that yields no result gives an empty
        ``div`` element and an empty list.  The root element always gets the
        class ``parsed-wiki-content``.

        Raises WikiParserError if the scanner cannot consume the whole text
        or if more than one result remains after evaluating all tokens.
        """
        text = re.sub('\r\n', '\n', text)
        # surrounding blank lines make sure the text starts and ends with a
        # paragraph break
        tokens, remainder = self.scanner.scan(''.join(['\n\n', text, '\n\n']))

        if remainder != '':
            raise WikiParserError('WikiParser error in "%s"' % text)

        tokens.append(EndToken())
        result = []
        state = {}

        # tokens may put further tokens back at the front of the list while
        # they are evaluated, so loop until the list is really exhausted
        while tokens:
            token = tokens.pop(0)
            token.evaluate(result, tokens, state, self.procs)

        if len(result) > 1:
            raise WikiParserError('WikiParser error in "%s"' % text)

        if len(result) == 1:
            xhtml_tree, translations = result[0].xhtml.to_element_tree()
        else:
            xhtml_tree = ElementTree.Element('div')
            translations = []

        xhtml_tree.set('class','parsed-wiki-content')

        return (xhtml_tree, translations)

    def evaluate(self, tree):
        """Apply the translations of a parsed tree to a copy of it.

        tree -- pair of the form returned by parse()

        For every translation ``trans`` the callable stored under PROC in
        ``self.procs[trans[1]]`` is called with ``trans[0]`` and the
        unpacked ``trans[2]``.  The input is deep-copied first and the first
        item of the copy is returned.
        """
        tree_  = deepcopy(tree)
        for trans in tree_[1]:
            self.procs[trans[1]][PROC](trans[0], *trans[2])
        return tree_[0]
# fellowiki documentation built on Sept. 19, 2017, 7:34 p.m.