scripts/munge.py
changeset 480 9b07ddeb1412
child 506 deaf548efde3
equal deleted inserted replaced
479:50bab5e71a66 480:9b07ddeb1412
       
     1 #!/usr/bin/python2.5
       
     2 #
       
     3 # Copyright 2008 the Melange authors.
       
     4 #
       
     5 # Licensed under the Apache License, Version 2.0 (the "License");
       
     6 # you may not use this file except in compliance with the License.
       
     7 # You may obtain a copy of the License at
       
     8 #
       
     9 #   http://www.apache.org/licenses/LICENSE-2.0
       
    10 #
       
    11 # Unless required by applicable law or agreed to in writing, software
       
    12 # distributed under the License is distributed on an "AS IS" BASIS,
       
    13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
       
    14 # See the License for the specific language governing permissions and
       
    15 # limitations under the License.
       
    16 
       
    17 # __doc__ string is slightly unconventional because it is used as usage text
       
    18 """%prog [OPTIONS] [FIND_REGEX] [REPLACE_FORMAT]
       
    19 
       
    20 Script to list, search, and modify files using Python regex patterns.
       
    21 
       
    22 OPTIONS:  optional command-line flags; see %prog --help
       
    23 
       
    24 FIND_REGEX:  an optional valid Python regular expression pattern;
       
    25   if supplied, only files containing at least one match will be processed;
       
    26   matching file paths will be printed; if supplied, REPLACE_FORMAT will be
       
    27   used to convert the match groups into formatted output.
       
    28 
       
    29 REPLACE_FORMAT:  an optional valid Python format string;
       
    30   FIND_REGEX must be supplied first if REPLACE_FORMAT is supplied;
       
    31   positional arguments will be replaced with ordered groups from
       
    32   FIND_REGEX matches, and named arguments will be replaced with named
       
    33   groups from FIND_REGEX matches."""
       
    34 
       
    35 __authors__ = [
       
    36   '"Todd Larsen" <tlarsen@google.com>',
       
    37 ]
       
    38 
       
    39 
       
    40 import dircache
       
    41 import errno
       
    42 import os
       
    43 import optparse
       
    44 import re
       
    45 import sre_constants
       
    46 import sys
       
    47 
       
    48 
       
    49 class Error(Exception):
       
    50   """Base class of all exceptions in this module.
       
    51   """
       
    52   pass
       
    53 
       
    54 
       
    55 def compileRegex(pattern):
       
    56   """Compiles a Python regex pattern into a regex object.
       
    57 
       
    58   Args:
       
    59     pattern: valid Python regex pattern string, or an already-compiled
       
    60       regex object (in which case this function is is a no-op)
       
    61 
       
    62   Returns:
       
    63     regex object compiled from pattern
       
    64 
       
    65   Raises:
       
    66     Error if pattern could not be compiled.
       
    67   """
       
    68   try:
       
    69     return re.compile(pattern)
       
    70   except sre_constants.error, error:
       
    71     msg = 're.compile: %s\n%s' % (error.args[0], pattern)
       
    72     raise Error(errno.EINVAL, msg)
       
    73 
       
    74 
       
    75 def findAll(text_to_search, pattern):
       
    76   """Returns all matches of a regex in a string.
       
    77   
       
    78   Args:
       
    79     text_to_search: string in which to find matches
       
    80     pattern: Python regex pattern (or already-compiled regex object)
       
    81       indicating which matches to retrieve
       
    82 
       
    83   Returns:
       
    84     a (possibly empty) list of the matches found, as strings 
       
    85   """
       
    86   matches = []
       
    87 
       
    88   def _captureMatchText(match):
       
    89     match_text = match.group()
       
    90     matches.append(match_text)
       
    91     return match_text
       
    92 
       
    93   compileRegex(pattern).sub(_captureMatchText, text_to_search)
       
    94 
       
    95   return matches 
       
    96 
       
    97 
       
    98 def getFileContents(file_path):
       
    99   """Reads the contents of a file as a single string, then closes the file.
       
   100   
       
   101   Args:
       
   102     file_path: path to the file to read its contents into a string
       
   103     
       
   104   Returns:
       
   105     a single string containing the entire contents of the file
       
   106   """
       
   107   file_to_read = open(file_path)
       
   108   file_contents = file_to_read.read()
       
   109   file_to_read.close()
       
   110   return file_contents
       
   111 
       
   112 
       
   113 def findAllInFile(file_path, pattern, *ignored_args, **ignored_kwargs): 
       
   114   """Action to return a list of all pattern matches in a file.
       
   115   
       
   116   Args:
       
   117     file_path: path of file to manipulate
       
   118     pattern: see findAll()
       
   119     *ignored_args: other positional arguments which are ignored
       
   120       command-line arguments not used by this action callable
       
   121     **ignored_kwargs: other keyword arguments which are ignored
       
   122       command-line options not used by this action callable
       
   123     
       
   124   Returns:
       
   125     two-tuple of boolean indicating if any match was found and a
       
   126     (possibly empty) list of the matches found, as strings (to be used
       
   127     as printable output of the action)
       
   128   """
       
   129   matches = findAll(getFileContents(file_path), pattern)
       
   130 
       
   131   if matches:
       
   132     found = True
       
   133   else:
       
   134     found = False
       
   135 
       
   136   return found, matches
       
   137 
       
   138 
       
   139 def replaceAll(original, pattern, format):
       
   140   """Substitutes formatted text for all matches in a string.
       
   141   
       
   142   Args:
       
   143     original: original string in which to find and replace matches
       
   144     pattern: Python regex pattern (or already-compiled regex object)
       
   145       indicating which matches to replace
       
   146     format: Python format string specifying how to format the
       
   147       replacement text; how this format string is interpreted depends
       
   148       on the contents of the pattern;  if the pattern contains:
       
   149         named groups: format is expected to contain named format specifiers
       
   150         unnamed groups: format is expected to contain exactly the same
       
   151           number of unnamed format specifiers as the number of groups in
       
   152           pattern
       
   153         no groups: format is expected to contain a single format specifier
       
   154           (in which case the entire match is supplied to it), or no format
       
   155           specifier at all (in which case the "format" string simply
       
   156           replaces the match with no substitutions from the match itself)
       
   157 
       
   158   Returns:
       
   159     two-tuple of the text with all matches replaced as specified by
       
   160     pattern and format, and a list of the original matches, each followed
       
   161     by its replacement 
       
   162   """
       
   163   matches_and_replacements = []
       
   164 
       
   165   def _replaceWithFormat(match):
       
   166     formatted_match = None
       
   167 
       
   168     if match.groupdict():
       
   169       try:
       
   170         formatted_match = format % match.groupdict()
       
   171       except TypeError:
       
   172         pass
       
   173 
       
   174     if (not formatted_match) and match.groups():
       
   175       try:
       
   176         formatted_match = format % match.groups()
       
   177       except TypeError:
       
   178         pass
       
   179 
       
   180     if (not formatted_match):
       
   181       try:
       
   182         formatted_match = format % match.group()
       
   183       except TypeError:
       
   184         formatted_match = format
       
   185 
       
   186     matches_and_replacements.append(match.group())
       
   187     matches_and_replacements.append(formatted_match)
       
   188     return formatted_match
       
   189 
       
   190   replaced = compileRegex(pattern).sub(_replaceWithFormat, original)
       
   191 
       
   192   return replaced, matches_and_replacements
       
   193 
       
   194 
       
   195 def writeAltFileIfExt(path, ext, contents):
       
   196   """Writes a file if path and additional extension are supplied.
       
   197 
       
   198   If path or ext are not supplied, no file is written.
       
   199 
       
   200   Args:
       
   201     path: path of file to be written, to which ext will be appended
       
   202     ext: additional file extension that will be appended to path
       
   203     contents: contents of file to be written, as a string
       
   204   """
       
   205   if (not path) or (not ext):
       
   206     return
       
   207 
       
   208   if ext.startswith('.'):
       
   209     ext = ext[1:]
       
   210  
       
   211   alt_path = '%s.%s' % (path, ext)
       
   212   alt_file = open(alt_path, 'w')
       
   213   alt_file.write(contents) 
       
   214   alt_file.close()
       
   215 
       
   216 
       
   217 def replaceAllInFile(file_path, pattern, format,
       
   218                      new_ext=None, backup_ext=None,
       
   219                      overwrite_files=False,
       
   220                      *ignored_args, **ignored_kwargs): 
       
   221   """Substitutes formatted text for all matches in a file.
       
   222   
       
   223   Args:
       
   224     file_path: path of file to manipulate
       
   225     pattern, format: see replaceAll()
       
   226     *ignored_args: other positional arguments which are ignored
       
   227       command-line arguments not used by this action callable
       
   228     **ignored_kwargs: other keyword arguments which are ignored
       
   229       command-line options not used by this action callable
       
   230     
       
   231   Returns:
       
   232     two-tuple of boolean indicating if any match was found and a
       
   233     list of printable output text lines containing pairs of original
       
   234     pattern matches each followed by the formatted replacement
       
   235   """
       
   236   original = getFileContents(file_path)
       
   237 
       
   238   replaced, matches_and_replacements = replaceAll(
       
   239     original, pattern, format)
       
   240 
       
   241   if matches_and_replacements:
       
   242     found = True
       
   243     writeAltFileIfExt(file_path, new_ext, replaced)
       
   244     writeAltFileIfExt(file_path, backup_ext, original)
       
   245 
       
   246     if overwrite_files:
       
   247       if replaced != original:
       
   248         replaced_file = open(file_path, 'w')
       
   249         replaced_file.write(replaced)
       
   250         replaced_file.close()
       
   251   else:
       
   252     found = False
       
   253 
       
   254   return found, matches_and_replacements
       
   255 
       
   256 
       
   257 def listFile(*ignored_args, **ignored_kwargs): 
       
   258   """No-op action callable that ignores arguments and returns (True, []).
       
   259   """
       
   260   return True, []  # match only based on file names, which was done by caller
       
   261 
       
   262 
       
   263 def applyActionToFiles(action, action_args,
       
   264                        start_path='', abs_path=False, files_pattern='',
       
   265                        recurse_dirs=False, dirs_pattern='',
       
   266                        follow_symlinks=False, quiet_output=False,
       
   267                        hide_paths=False, **action_options):
       
   268   """Applies a callable action to files, based on options and arguments.
       
   269   
       
   270   Args:
       
   271     action: callable that expects a file path argument, positional arguments
       
   272       (action_args), and keyword options from the command-line options dict;
       
   273       and returns a "matched" boolean and a list of output strings
       
   274     action_args: list of positional arguments, if any; passed to action
       
   275       callable unchanged
       
   276     start_path: required path of initial directory to visit
       
   277     abs_path: optional boolean indicating to use absolute paths
       
   278     files_pattern: required Python regex (object or pattern) which selects
       
   279       which files to pass to the action callable
       
   280     recurse_dirs: boolean indicating if subdirectories should be traversed 
       
   281     dirs_pattern: Python regex (object or pattern) which selects which
       
   282       subdirectories to traverse if recurse_dirs is True
       
   283     follow_symlinks: boolean indicating if symlinks should be traversed
       
   284     quiet_output: optional boolean indicating if output should be suppressed
       
   285     hide_paths: optional boolean indicating to omit file paths from output
       
   286     **action_options: remaining keyword arguments that are passed unchanged
       
   287       to the action callable
       
   288 
       
   289   Returns:
       
   290     two-tuple containing an exit code and a (possibly empty) list of
       
   291     output strings
       
   292     
       
   293   Raises:
       
   294     Error exception if problems occur (file I/O, invalid regex, etc.).
       
   295   """
       
   296   exit_code = errno.ENOENT
       
   297   output = []
       
   298 
       
   299   start_path = os.path.expandvars(os.path.expanduser(start_path))
       
   300 
       
   301   if abs_path:
       
   302     start_path = os.path.abspath(start_path)
       
   303 
       
   304   paths = [start_path]
       
   305 
       
   306   files_regex = compileRegex(files_pattern)
       
   307   
       
   308   if recurse_dirs:
       
   309     dirs_regex = compileRegex(dirs_pattern)
       
   310 
       
   311   while paths:
       
   312     sub_paths = []
       
   313 
       
   314     for path in paths:
       
   315       # expand iterator into an actual list and sort it
       
   316       try:
       
   317         items = dircache.listdir(path)[:]
       
   318       except (IOError, OSError), error:
       
   319         raise Error(error.args[0], '%s: %s' % (
       
   320                     error.__class__.__name__, error.args[1]))
       
   321 
       
   322       items.sort()
       
   323 
       
   324       for item in items:
       
   325         item_path = os.path.join(path, item)
       
   326 
       
   327         if os.path.islink(item_path):
       
   328           if not follow_symlinks:
       
   329             continue  # do not follow symlinks (ignore them)
       
   330 
       
   331         if os.path.isdir(item_path):
       
   332           if recurse_dirs:
       
   333             if dirs_regex.match(item):
       
   334               sub_paths.append(item_path)
       
   335           continue
       
   336       
       
   337         if files_regex.match(item):
       
   338           try:
       
   339             matched, found_output = action(item_path, *action_args,
       
   340                                            **action_options)
       
   341           except (IOError, OSError), error:
       
   342             raise Error(error.args[0], '%s: %s' % (
       
   343                         error.__class__.__name__, error.args[1]))
       
   344 
       
   345           if matched:
       
   346             exit_code = 0  # at least one matched file has now been found
       
   347 
       
   348             if (not quiet_output) and (not hide_paths):
       
   349               output.append(item_path)
       
   350 
       
   351           if not quiet_output:
       
   352             output.extend(found_output)
       
   353 
       
   354     paths = sub_paths
       
   355   
       
   356   return exit_code, output
       
   357 
       
   358 
       
   359 class _ErrorOptionParser(optparse.OptionParser):
       
   360   """Customized optparse.OptionParser that does not call sys.exit().
       
   361   """
       
   362 
       
   363   def error(self, msg):
       
   364     """Raises an Error exception, instead of calling sys.exit().
       
   365     """
       
   366     raise Error(errno.EINVAL, msg)
       
   367 
       
   368 
       
   369 def _buildParser():
       
   370   """Returns a custom OptionParser for parsing command-line arguments.
       
   371   """
       
   372   parser = _ErrorOptionParser(__doc__)
       
   373 
       
   374   filter_group = optparse.OptionGroup(parser,
       
   375     'File Options',
       
   376     'Options used to select which files to process.')
       
   377 
       
   378   filter_group.add_option(
       
   379     '-f', '--files', dest='files_pattern', default='^.*$',
       
   380     metavar='FILES_REGEX',
       
   381     help=('Python regex pattern (*not* a glob!) defining files to process'
       
   382           ' in each directory [default: %default]'))
       
   383 
       
   384   filter_group.add_option(
       
   385     '-F', '--follow', dest='follow_symlinks', default=False,
       
   386     action='store_true',
       
   387     help=('follow file and subdirectory symlinks (possibly *DANGEROUS*)'
       
   388           ' [default: %default]'))
       
   389 
       
   390   parser.add_option_group(filter_group)
       
   391 
       
   392   dir_group = optparse.OptionGroup(parser,
       
   393     'Directory Options',
       
   394     'Options used to indicate which directories to traverse.')
       
   395 
       
   396   dir_group.add_option(
       
   397     '-s', '--start', dest='start_path', default=os.curdir, metavar='PATH',
       
   398     help='directory in which to start processing files [default: %default]')
       
   399 
       
   400   dir_group.add_option(
       
   401     '-R', '--recursive', dest='recurse_dirs', default=False,
       
   402     action='store_true',
       
   403     help='recurse into subdirectories [default: %default]')
       
   404 
       
   405   dir_group.add_option(
       
   406     '-d', '--dirs', dest='dirs_pattern', default='^.*$',
       
   407     metavar='SUBDIRS_REGEX',
       
   408     help=('Python regex pattern (*not* a glob!) defining subdirectories to'
       
   409           ' recurse into (if --recursive) [default: %default]'))
       
   410 
       
   411   parser.add_option_group(dir_group)
       
   412 
       
   413   output_group = optparse.OptionGroup(parser,
       
   414     'Output Options',
       
   415     'Options used to control program output.')
       
   416 
       
   417   output_group.add_option(
       
   418     '-a', '--abspath', dest='abs_path', default=False, action='store_true',
       
   419     help=('output absolute paths instead of relative paths'
       
   420           ' [default: %default]'))
       
   421 
       
   422   output_group.add_option(
       
   423     '-p', '--nopaths', dest='hide_paths', default=False, action='store_true',
       
   424     help=('suppress printing of file path names for successfully matched'
       
   425           ' files to stdout [default: %default]'))
       
   426 
       
   427   output_group.add_option(
       
   428     '-q', '--quiet', dest='quiet_output', default=False, action='store_true',
       
   429     help=('suppress *all* printed output to stdout (but still perform'
       
   430           ' replacements if specified) [default: %default]'))
       
   431 
       
   432   parser.add_option_group(output_group)
       
   433 
       
   434   replace_group = optparse.OptionGroup(parser,
       
   435     'Replace Options',
       
   436     'Options applied when matches in files are replaced with substitutions.'
       
   437     ' (Only possible if REPLACE_FORMAT is supplied.)')
       
   438 
       
   439   replace_group.add_option(
       
   440     '-o', '--overwrite', dest='overwrite_files', default=False,
       
   441     action='store_true',
       
   442     help=('overwrite original files with formatted text substituted for'
       
   443           ' matches [default: %default]'))  
       
   444 
       
   445   replace_group.add_option(
       
   446     '-b', '--backup', dest='backup_ext', default='', metavar='EXTENSION',
       
   447     help=('if supplied, and file would be overwritten, backup original'
       
   448           ' file with the supplied extension [default is no backups of'
       
   449           ' overwritten files are kept]'))
       
   450 
       
   451   replace_group.add_option(
       
   452     '-n', '--new', dest='new_ext', default='', metavar='EXTENSION',
       
   453     help=('if supplied, and file has matches and and is altered by'
       
   454           ' substitutions, create a new file with the supplied extension'
       
   455           ' [default is no new file is created]'))
       
   456 
       
   457   parser.add_option_group(replace_group)
       
   458 
       
   459   return parser
       
   460 
       
   461 
       
   462 def _parseArgs(cmd_line_args):
       
   463   """Builds a command-line option parser and parses command-line arguments.
       
   464   
       
   465   Args:
       
   466     cmd_line_args: command-line arguments, excluding the argv[0] program name
       
   467     
       
   468   Returns:
       
   469     four-tuple of action callable, supplied command-line options (including
       
   470     those defined by defaults in the command-line parser) as a dict,
       
   471     remaining positional command-line arguments, and the parser itself
       
   472     
       
   473   Raises:
       
   474     Error if problems occurred during commmand-line argument parsing.
       
   475   """
       
   476   parser = _buildParser()
       
   477   options, args = parser.parse_args(args=cmd_line_args)
       
   478 
       
   479   if not args:
       
   480     # no FIND_REGEX or REPLACE_PATTERN supplied, so just match based
       
   481     # on file name and subdirectory name patterns
       
   482     action = listFile
       
   483   elif len(args) == 1:
       
   484     # FIND_REGEX supplied, but not REPLACE_PATTERN, so just match based
       
   485     # on file name and subdirectory name patterns, and then on file
       
   486     # contents
       
   487     action = findAllInFile
       
   488   elif len(args) == 2:
       
   489     # FIND_REGEX and REPLACE_PATTERN both supplied, so match based
       
   490     # on file name and subdirectory name patterns, and then do a find and
       
   491     # replace on file contents
       
   492     action = replaceAllInFile
       
   493   else:
       
   494     raise Error(errno.EINVAL,'too many (%d) arguments supplied:\n%s' % (
       
   495                 len(args), ' '.join(args)))
       
   496 
       
   497   return action, vars(options), args, parser
       
   498 
       
   499  
       
   500 def _main(argv):
       
   501   """Wrapper that catches exceptions, prints output, and returns exit status.
       
   502   
       
   503   Normal program output is printed to stdout.  Error output (including
       
   504   exception text) is printed to stderr.
       
   505   
       
   506   Args:
       
   507     argv: script arguments, usually sys.argv; argv[0] is expected to be the
       
   508       program name
       
   509       
       
   510   Returns:
       
   511     exit code suitable for sys.exit()
       
   512   """
       
   513   options = {}  # empty options, used if _parseArgs() fails
       
   514 
       
   515   try:
       
   516     action, options, args, parser = _parseArgs(argv[1:])
       
   517     exit_code, output = applyActionToFiles(action, args, **options)
       
   518 
       
   519     if output:  print '\n'.join(output)
       
   520 
       
   521   except Error, error:
       
   522     if not options.get('quiet_output'):
       
   523       print >>sys.stderr, '\nERROR: (%s: %s) %s\n' % (
       
   524         error.args[0], os.strerror(error.args[0]), error.args[1])
       
   525       print >>sys.stderr, parser.get_usage()
       
   526 
       
   527     exit_code = error.args[0]
       
   528 
       
   529   return exit_code
       
   530 
       
   531 
       
   532 if __name__ == '__main__':
       
   533   sys.exit(_main(sys.argv))