changeset 69 c6bca38c1cbf
equal deleted inserted replaced
68:5ff1fc726848 69:c6bca38c1cbf
     1 # - minimal reStructuredText parser
     2 #
     3 # Copyright 2009, 2010 Matt Mackall <> and others
     4 #
     5 # This software may be used and distributed according to the terms of the
     6 # GNU General Public License version 2 or any later version.
     8 """simplified reStructuredText parser.
    10 This parser knows just enough about reStructuredText to parse the
    11 Mercurial docstrings.
    13 It cheats in a major way: nested blocks are not really nested. They
    14 are just indented blocks that look like they are nested. This relies
    15 on the user to keep the right indentation for the blocks.
    17 It only supports a small subset of reStructuredText:
    19 - sections
    21 - paragraphs
    23 - literal blocks
    25 - definition lists
    27 - specific admonitions
    29 - bullet lists (items must start with '-')
    31 - enumerated lists (no autonumbering)
    33 - field lists (colons cannot be escaped)
    35 - option lists (supports only long options without arguments)
    37 - inline literals (no other inline markup is recognized)
    38 """
    40 import re, sys
    41 import util, encoding
    42 from i18n import _
    45 def replace(text, substs):
    46     utext = text.decode(encoding.encoding)
    47     for f, t in substs:
    48         utext = utext.replace(f, t)
    49     return utext.encode(encoding.encoding)
    52 _blockre = re.compile(r"\n(?:\s*\n)+")
    54 def findblocks(text):
    55     """Find continuous blocks of lines in text.
    57     Returns a list of dictionaries representing the blocks. Each block
    58     has an 'indent' field and a 'lines' field.
    59     """
    60     blocks = []
    61     for b in _blockre.split(text.strip()):
    62         lines = b.splitlines()
    63         indent = min((len(l) - len(l.lstrip())) for l in lines)
    64         lines = [l[indent:] for l in lines]
    65         blocks.append(dict(indent=indent, lines=lines))
    66     return blocks
    69 def findliteralblocks(blocks):
    70     """Finds literal blocks and adds a 'type' field to the blocks.
    72     Literal blocks are given the type 'literal', all other blocks are
    73     given type the 'paragraph'.
    74     """
    75     i = 0
    76     while i < len(blocks):
    77         # Searching for a block that looks like this:
    78         #
    79         # +------------------------------+
    80         # | paragraph                    |
    81         # | (ends with "::")             |
    82         # +------------------------------+
    83         #    +---------------------------+
    84         #    | indented literal block    |
    85         #    +---------------------------+
    86         blocks[i]['type'] = 'paragraph'
    87         if blocks[i]['lines'][-1].endswith('::') and i + 1 < len(blocks):
    88             indent = blocks[i]['indent']
    89             adjustment = blocks[i + 1]['indent'] - indent
    91             if blocks[i]['lines'] == ['::']:
    92                 # Expanded form: remove block
    93                 del blocks[i]
    94                 i -= 1
    95             elif blocks[i]['lines'][-1].endswith(' ::'):
    96                 # Partially minimized form: remove space and both
    97                 # colons.
    98                 blocks[i]['lines'][-1] = blocks[i]['lines'][-1][:-3]
    99             else:
   100                 # Fully minimized form: remove just one colon.
   101                 blocks[i]['lines'][-1] = blocks[i]['lines'][-1][:-1]
   103             # List items are formatted with a hanging indent. We must
   104             # correct for this here while we still have the original
   105             # information on the indentation of the subsequent literal
   106             # blocks available.
   107             m = _bulletre.match(blocks[i]['lines'][0])
   108             if m:
   109                 indent += m.end()
   110                 adjustment -= m.end()
   112             # Mark the following indented blocks.
   113             while i + 1 < len(blocks) and blocks[i + 1]['indent'] > indent:
   114                 blocks[i + 1]['type'] = 'literal'
   115                 blocks[i + 1]['indent'] -= adjustment
   116                 i += 1
   117         i += 1
   118     return blocks
   120 _bulletre = re.compile(r'(-|[0-9A-Za-z]+\.|\(?[0-9A-Za-z]+\)|\|) ')
   121 _optionre = re.compile(r'^(--[a-z-]+)((?:[ =][a-zA-Z][\w-]*)?  +)(.*)$')
   122 _fieldre = re.compile(r':(?![: ])([^:]*)(?<! ):[ ]+(.*)')
   123 _definitionre = re.compile(r'[^ ]')
   125 def splitparagraphs(blocks):
   126     """Split paragraphs into lists."""
   127     # Tuples with (list type, item regexp, single line items?). Order
   128     # matters: definition lists has the least specific regexp and must
   129     # come last.
   130     listtypes = [('bullet', _bulletre, True),
   131                  ('option', _optionre, True),
   132                  ('field', _fieldre, True),
   133                  ('definition', _definitionre, False)]
   135     def match(lines, i, itemre, singleline):
   136         """Does itemre match an item at line i?
   138         A list item can be followed by an idented line or another list
   139         item (but only if singleline is True).
   140         """
   141         line1 = lines[i]
   142         line2 = i + 1 < len(lines) and lines[i + 1] or ''
   143         if not itemre.match(line1):
   144             return False
   145         if singleline:
   146             return line2 == '' or line2[0] == ' ' or itemre.match(line2)
   147         else:
   148             return line2.startswith(' ')
   150     i = 0
   151     while i < len(blocks):
   152         if blocks[i]['type'] == 'paragraph':
   153             lines = blocks[i]['lines']
   154             for type, itemre, singleline in listtypes:
   155                 if match(lines, 0, itemre, singleline):
   156                     items = []
   157                     for j, line in enumerate(lines):
   158                         if match(lines, j, itemre, singleline):
   159                             items.append(dict(type=type, lines=[],
   160                                               indent=blocks[i]['indent']))
   161                         items[-1]['lines'].append(line)
   162                     blocks[i:i + 1] = items
   163                     break
   164         i += 1
   165     return blocks
   168 _fieldwidth = 12
   170 def updatefieldlists(blocks):
   171     """Find key and maximum key width for field lists."""
   172     i = 0
   173     while i < len(blocks):
   174         if blocks[i]['type'] != 'field':
   175             i += 1
   176             continue
   178         keywidth = 0
   179         j = i
   180         while j < len(blocks) and blocks[j]['type'] == 'field':
   181             m = _fieldre.match(blocks[j]['lines'][0])
   182             key, rest = m.groups()
   183             blocks[j]['lines'][0] = rest
   184             blocks[j]['key'] = key
   185             keywidth = max(keywidth, len(key))
   186             j += 1
   188         for block in blocks[i:j]:
   189             block['keywidth'] = keywidth
   190         i = j + 1
   192     return blocks
   195 def prunecontainers(blocks, keep):
   196     """Prune unwanted containers.
   198     The blocks must have a 'type' field, i.e., they should have been
   199     run through findliteralblocks first.
   200     """
   201     pruned = []
   202     i = 0
   203     while i + 1 < len(blocks):
   204         # Searching for a block that looks like this:
   205         #
   206         # +-------+---------------------------+
   207         # | ".. container ::" type            |
   208         # +---+                               |
   209         #     | blocks                        |
   210         #     +-------------------------------+
   211         if (blocks[i]['type'] == 'paragraph' and
   212             blocks[i]['lines'][0].startswith('.. container::')):
   213             indent = blocks[i]['indent']
   214             adjustment = blocks[i + 1]['indent'] - indent
   215             containertype = blocks[i]['lines'][0][15:]
   216             prune = containertype not in keep
   217             if prune:
   218                 pruned.append(containertype)
   220             # Always delete "..container:: type" block
   221             del blocks[i]
   222             j = i
   223             while j < len(blocks) and blocks[j]['indent'] > indent:
   224                 if prune:
   225                     del blocks[j]
   226                     i -= 1 # adjust outer index
   227                 else:
   228                     blocks[j]['indent'] -= adjustment
   229                     j += 1
   230         i += 1
   231     return blocks, pruned
   234 _sectionre = re.compile(r"""^([-=`:.'"~^_*+#])\1+$""")
   236 def findsections(blocks):
   237     """Finds sections.
   239     The blocks must have a 'type' field, i.e., they should have been
   240     run through findliteralblocks first.
   241     """
   242     for block in blocks:
   243         # Searching for a block that looks like this:
   244         #
   245         # +------------------------------+
   246         # | Section title                |
   247         # | -------------                |
   248         # +------------------------------+
   249         if (block['type'] == 'paragraph' and
   250             len(block['lines']) == 2 and
   251             encoding.colwidth(block['lines'][0]) == len(block['lines'][1]) and
   252             _sectionre.match(block['lines'][1])):
   253             block['underline'] = block['lines'][1][0]
   254             block['type'] = 'section'
   255             del block['lines'][1]
   256     return blocks
   259 def inlineliterals(blocks):
   260     substs = [('``', '"')]
   261     for b in blocks:
   262         if b['type'] in ('paragraph', 'section'):
   263             b['lines'] = [replace(l, substs) for l in b['lines']]
   264     return blocks
   267 def hgrole(blocks):
   268     substs = [(':hg:`', '"hg '), ('`', '"')]
   269     for b in blocks:
   270         if b['type'] in ('paragraph', 'section'):
   271             # Turn :hg:`command` into "hg command". This also works
   272             # when there is a line break in the command and relies on
   273             # the fact that we have no stray back-quotes in the input
   274             # (run the blocks through inlineliterals first).
   275             b['lines'] = [replace(l, substs) for l in b['lines']]
   276     return blocks
   279 def addmargins(blocks):
   280     """Adds empty blocks for vertical spacing.
   282     This groups bullets, options, and definitions together with no vertical
   283     space between them, and adds an empty block between all other blocks.
   284     """
   285     i = 1
   286     while i < len(blocks):
   287         if (blocks[i]['type'] == blocks[i - 1]['type'] and
   288             blocks[i]['type'] in ('bullet', 'option', 'field')):
   289             i += 1
   290         else:
   291             blocks.insert(i, dict(lines=[''], indent=0, type='margin'))
   292             i += 2
   293     return blocks
   295 def prunecomments(blocks):
   296     """Remove comments."""
   297     i = 0
   298     while i < len(blocks):
   299         b = blocks[i]
   300         if b['type'] == 'paragraph' and b['lines'][0].startswith('.. '):
   301             del blocks[i]
   302         else:
   303             i += 1
   304     return blocks
   306 _admonitionre = re.compile(r"\.\. (admonition|attention|caution|danger|"
   307                            r"error|hint|important|note|tip|warning)::",
   308                            flags=re.IGNORECASE)
   310 def findadmonitions(blocks):
   311     """
   312     Makes the type of the block an admonition block if
   313     the first line is an admonition directive
   314     """
   315     i = 0
   316     while i < len(blocks):
   317         m = _admonitionre.match(blocks[i]['lines'][0])
   318         if m:
   319             blocks[i]['type'] = 'admonition'
   320             admonitiontitle = blocks[i]['lines'][0][3:m.end() - 2].lower()
   322             firstline = blocks[i]['lines'][0][m.end() + 1:]
   323             if firstline:
   324                 blocks[i]['lines'].insert(1, '   ' + firstline)
   326             blocks[i]['admonitiontitle'] = admonitiontitle
   327             del blocks[i]['lines'][0]
   328         i = i + 1
   329     return blocks
   331 _admonitiontitles = {'attention': _('Attention:'),
   332                      'caution': _('Caution:'),
   333                      'danger': _('!Danger!')  ,
   334                      'error': _('Error:'),
   335                      'hint': _('Hint:'),
   336                      'important': _('Important:'),
   337                      'note': _('Note:'),
   338                      'tip': _('Tip:'),
   339                      'warning': _('Warning!')}
   341 def formatblock(block, width):
   342     """Format a block according to width."""
   343     if width <= 0:
   344         width = 78
   345     indent = ' ' * block['indent']
   346     if block['type'] == 'admonition':
   347         admonition = _admonitiontitles[block['admonitiontitle']]
   348         hang = len(block['lines'][-1]) - len(block['lines'][-1].lstrip())
   350         defindent = indent + hang * ' '
   351         text = ' '.join(map(str.strip, block['lines']))
   352         return '%s\n%s' % (indent + admonition, util.wrap(text, width=width,
   353                                            initindent=defindent,
   354                                            hangindent=defindent))
   355     if block['type'] == 'margin':
   356         return ''
   357     if block['type'] == 'literal':
   358         indent += '  '
   359         return indent + ('\n' + indent).join(block['lines'])
   360     if block['type'] == 'section':
   361         underline = encoding.colwidth(block['lines'][0]) * block['underline']
   362         return "%s%s\n%s%s" % (indent, block['lines'][0],indent, underline)
   363     if block['type'] == 'definition':
   364         term = indent + block['lines'][0]
   365         hang = len(block['lines'][-1]) - len(block['lines'][-1].lstrip())
   366         defindent = indent + hang * ' '
   367         text = ' '.join(map(str.strip, block['lines'][1:]))
   368         return '%s\n%s' % (term, util.wrap(text, width=width,
   369                                            initindent=defindent,
   370                                            hangindent=defindent))
   371     subindent = indent
   372     if block['type'] == 'bullet':
   373         if block['lines'][0].startswith('| '):
   374             # Remove bullet for line blocks and add no extra
   375             # indention.
   376             block['lines'][0] = block['lines'][0][2:]
   377         else:
   378             m = _bulletre.match(block['lines'][0])
   379             subindent = indent + m.end() * ' '
   380     elif block['type'] == 'field':
   381         keywidth = block['keywidth']
   382         key = block['key']
   384         subindent = indent + _fieldwidth * ' '
   385         if len(key) + 2 > _fieldwidth:
   386             # key too large, use full line width
   387             key = key.ljust(width)
   388         elif keywidth + 2 < _fieldwidth:
   389             # all keys are small, add only two spaces
   390             key = key.ljust(keywidth + 2)
   391             subindent = indent + (keywidth + 2) * ' '
   392         else:
   393             # mixed sizes, use fieldwidth for this one
   394             key = key.ljust(_fieldwidth)
   395         block['lines'][0] = key + block['lines'][0]
   396     elif block['type'] == 'option':
   397         m = _optionre.match(block['lines'][0])
   398         option, arg, rest = m.groups()
   399         subindent = indent + (len(option) + len(arg)) * ' '
   401     text = ' '.join(map(str.strip, block['lines']))
   402     return util.wrap(text, width=width,
   403                      initindent=indent,
   404                      hangindent=subindent)
   407 def format(text, width, indent=0, keep=None):
   408     """Parse and format the text according to width."""
   409     blocks = findblocks(text)
   410     for b in blocks:
   411         b['indent'] += indent
   412     blocks = findliteralblocks(blocks)
   413     blocks, pruned = prunecontainers(blocks, keep or [])
   414     blocks = findsections(blocks)
   415     blocks = inlineliterals(blocks)
   416     blocks = hgrole(blocks)
   417     blocks = splitparagraphs(blocks)
   418     blocks = updatefieldlists(blocks)
   419     blocks = prunecomments(blocks)
   420     blocks = addmargins(blocks)
   421     blocks = findadmonitions(blocks)
   422     text = '\n'.join(formatblock(b, width) for b in blocks)
   423     if keep is None:
   424         return text
   425     else:
   426         return text, pruned
   429 if __name__ == "__main__":
   430     from pprint import pprint
   432     def debug(func, *args):
   433         blocks = func(*args)
   434         print "*** after %s:" % func.__name__
   435         pprint(blocks)
   436         print
   437         return blocks
   439     text = open(sys.argv[1]).read()
   440     blocks = debug(findblocks, text)
   441     blocks = debug(findliteralblocks, blocks)
   442     blocks, pruned = debug(prunecontainers, blocks, sys.argv[2:])
   443     blocks = debug(inlineliterals, blocks)
   444     blocks = debug(splitparagraphs, blocks)
   445     blocks = debug(updatefieldlists, blocks)
   446     blocks = debug(findsections, blocks)
   447     blocks = debug(prunecomments, blocks)
   448     blocks = debug(addmargins, blocks)
   449     blocks = debug(findadmonitions, blocks)
   450     print '\n'.join(formatblock(b, 30) for b in blocks)