For those times when sed isn't enough, but awk is too much, there's munge.py...
author Todd Larsen <tlarsen@google.com>
Fri, 14 Nov 2008 06:36:42 +0000
changeset 480 9b07ddeb1412
parent 479 50bab5e71a66
child 481 94834a1e6c01
For those times when sed isn't enough, but awk is too much, there's munge.py... Patch by: Todd Larsen
scripts/munge.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/munge.py	Fri Nov 14 06:36:42 2008 +0000
@@ -0,0 +1,533 @@
+#!/usr/bin/python2.5
+#
+# Copyright 2008 the Melange authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# __doc__ string is slightly unconventional because it is used as usage text
+"""%prog [OPTIONS] [FIND_REGEX] [REPLACE_FORMAT]
+
+Script to list, search, and modify files using Python regex patterns.
+
+OPTIONS:  optional command-line flags; see %prog --help
+
+FIND_REGEX:  an optional valid Python regular expression pattern;
+  if supplied, only files containing at least one match will be processed;
+  matching file paths will be printed; if supplied, REPLACE_FORMAT will be
+  used to convert the match groups into formatted output.
+
+REPLACE_FORMAT:  an optional valid Python format string;
+  FIND_REGEX must be supplied first if REPLACE_FORMAT is supplied;
+  positional arguments will be replaced with ordered groups from
+  FIND_REGEX matches, and named arguments will be replaced with named
+  groups from FIND_REGEX matches."""
+
+__authors__ = [
+  '"Todd Larsen" <tlarsen@google.com>',
+]
+
+
+import dircache
+import errno
+import os
+import optparse
+import re
+import sre_constants
+import sys
+
+
+class Error(Exception):
+  """Base class of all exceptions in this module.
+
+  Throughout this script, instances are raised with two positional
+  arguments: an errno code followed by a message string.  _main() reads
+  them back from args[0] and args[1] to build its error output and the
+  exit code it returns.
+  """
+  pass
+
+
+def compileRegex(pattern):
+  """Compiles a Python regex pattern into a regex object.
+
+  Args:
+    pattern: valid Python regex pattern string, or an already-compiled
+      regex object (in which case this function is is a no-op)
+
+  Returns:
+    regex object compiled from pattern
+
+  Raises:
+    Error if pattern could not be compiled.
+  """
+  try:
+    return re.compile(pattern)
+  except sre_constants.error, error:
+    msg = 're.compile: %s\n%s' % (error.args[0], pattern)
+    raise Error(errno.EINVAL, msg)
+
+
+def findAll(text_to_search, pattern):
+  """Returns all matches of a regex in a string.
+  
+  Args:
+    text_to_search: string in which to find matches
+    pattern: Python regex pattern (or already-compiled regex object)
+      indicating which matches to retrieve
+
+  Returns:
+    a (possibly empty) list of the matches found, as strings 
+  """
+  matches = []
+
+  def _captureMatchText(match):
+    match_text = match.group()
+    matches.append(match_text)
+    return match_text
+
+  compileRegex(pattern).sub(_captureMatchText, text_to_search)
+
+  return matches 
+
+
+def getFileContents(file_path):
+  """Reads the contents of a file as a single string, then closes the file.
+  
+  Args:
+    file_path: path to the file to read its contents into a string
+    
+  Returns:
+    a single string containing the entire contents of the file
+  """
+  file_to_read = open(file_path)
+  file_contents = file_to_read.read()
+  file_to_read.close()
+  return file_contents
+
+
+def findAllInFile(file_path, pattern, *ignored_args, **ignored_kwargs): 
+  """Action to return a list of all pattern matches in a file.
+  
+  Args:
+    file_path: path of file to manipulate
+    pattern: see findAll()
+    *ignored_args: other positional arguments which are ignored
+      command-line arguments not used by this action callable
+    **ignored_kwargs: other keyword arguments which are ignored
+      command-line options not used by this action callable
+    
+  Returns:
+    two-tuple of boolean indicating if any match was found and a
+    (possibly empty) list of the matches found, as strings (to be used
+    as printable output of the action)
+  """
+  matches = findAll(getFileContents(file_path), pattern)
+
+  if matches:
+    found = True
+  else:
+    found = False
+
+  return found, matches
+
+
+def replaceAll(original, pattern, format):
+  """Substitutes formatted text for all matches in a string.
+  
+  Args:
+    original: original string in which to find and replace matches
+    pattern: Python regex pattern (or already-compiled regex object)
+      indicating which matches to replace
+    format: Python format string specifying how to format the
+      replacement text; how this format string is interpreted depends
+      on the contents of the pattern;  if the pattern contains:
+        named groups: format is expected to contain named format specifiers
+        unnamed groups: format is expected to contain exactly the same
+          number of unnamed format specifiers as the number of groups in
+          pattern
+        no groups: format is expected to contain a single format specifier
+          (in which case the entire match is supplied to it), or no format
+          specifier at all (in which case the "format" string simply
+          replaces the match with no substitutions from the match itself)
+
+  Returns:
+    two-tuple of the text with all matches replaced as specified by
+    pattern and format, and a list of the original matches, each followed
+    by its replacement 
+  """
+  matches_and_replacements = []
+
+  def _replaceWithFormat(match):
+    formatted_match = None
+
+    if match.groupdict():
+      try:
+        formatted_match = format % match.groupdict()
+      except TypeError:
+        pass
+
+    if (not formatted_match) and match.groups():
+      try:
+        formatted_match = format % match.groups()
+      except TypeError:
+        pass
+
+    if (not formatted_match):
+      try:
+        formatted_match = format % match.group()
+      except TypeError:
+        formatted_match = format
+
+    matches_and_replacements.append(match.group())
+    matches_and_replacements.append(formatted_match)
+    return formatted_match
+
+  replaced = compileRegex(pattern).sub(_replaceWithFormat, original)
+
+  return replaced, matches_and_replacements
+
+
+def writeAltFileIfExt(path, ext, contents):
+  """Writes a file if path and additional extension are supplied.
+
+  If path or ext are not supplied, no file is written.
+
+  Args:
+    path: path of file to be written, to which ext will be appended
+    ext: additional file extension that will be appended to path
+    contents: contents of file to be written, as a string
+  """
+  if (not path) or (not ext):
+    return
+
+  if ext.startswith('.'):
+    ext = ext[1:]
+ 
+  alt_path = '%s.%s' % (path, ext)
+  alt_file = open(alt_path, 'w')
+  alt_file.write(contents) 
+  alt_file.close()
+
+
+def replaceAllInFile(file_path, pattern, format,
+                     new_ext=None, backup_ext=None,
+                     overwrite_files=False,
+                     *ignored_args, **ignored_kwargs): 
+  """Substitutes formatted text for all matches in a file.
+  
+  Args:
+    file_path: path of file to manipulate
+    pattern, format: see replaceAll()
+    *ignored_args: other positional arguments which are ignored
+      command-line arguments not used by this action callable
+    **ignored_kwargs: other keyword arguments which are ignored
+      command-line options not used by this action callable
+    
+  Returns:
+    two-tuple of boolean indicating if any match was found and a
+    list of printable output text lines containing pairs of original
+    pattern matches each followed by the formatted replacement
+  """
+  original = getFileContents(file_path)
+
+  replaced, matches_and_replacements = replaceAll(
+    original, pattern, format)
+
+  if matches_and_replacements:
+    found = True
+    writeAltFileIfExt(file_path, new_ext, replaced)
+    writeAltFileIfExt(file_path, backup_ext, original)
+
+    if overwrite_files:
+      if replaced != original:
+        replaced_file = open(file_path, 'w')
+        replaced_file.write(replaced)
+        replaced_file.close()
+  else:
+    found = False
+
+  return found, matches_and_replacements
+
+
+def listFile(*ignored_args, **ignored_kwargs): 
+  """No-op action callable that ignores arguments and returns (True, []).
+
+  Selected by _parseArgs() when no FIND_REGEX is supplied, so that every
+  file whose name passed the caller's filters counts as matched and
+  contributes no output lines beyond its path.
+
+  Args:
+    *ignored_args: positional arguments, which are ignored
+    **ignored_kwargs: keyword arguments, which are ignored
+
+  Returns:
+    the two-tuple (True, []): always "matched", with no output lines
+  """
+  return True, []  # match only based on file names, which was done by caller
+
+
+def applyActionToFiles(action, action_args,
+                       start_path='', abs_path=False, files_pattern='',
+                       recurse_dirs=False, dirs_pattern='',
+                       follow_symlinks=False, quiet_output=False,
+                       hide_paths=False, **action_options):
+  """Applies a callable action to files, based on options and arguments.
+
+  Directories are visited one level at a time, starting at start_path;
+  within each directory, items are processed in sorted name order.
+  
+  Args:
+    action: callable that expects a file path argument, positional arguments
+      (action_args), and keyword options from the command-line options dict;
+      and returns a "matched" boolean and a list of output strings
+    action_args: list of positional arguments, if any; passed to action
+      callable unchanged
+    start_path: required path of initial directory to visit; '~' and
+      environment variables in it are expanded
+    abs_path: optional boolean indicating to use absolute paths
+    files_pattern: required Python regex (object or pattern) which selects
+      which files to pass to the action callable; applied with match() to
+      the file name only, not to the whole path
+    recurse_dirs: boolean indicating if subdirectories should be traversed 
+    dirs_pattern: Python regex (object or pattern) which selects which
+      subdirectories to traverse if recurse_dirs is True; applied with
+      match() to the subdirectory name only, not to the whole path
+    follow_symlinks: boolean indicating if symlinks should be traversed;
+      if False, symlinked files and directories are both skipped
+    quiet_output: optional boolean indicating if output should be suppressed
+    hide_paths: optional boolean indicating to omit file paths from output
+    **action_options: remaining keyword arguments that are passed unchanged
+      to the action callable
+
+  Returns:
+    two-tuple containing an exit code and a (possibly empty) list of
+    output strings; the exit code is 0 if the action reported a match for
+    at least one file, and errno.ENOENT otherwise
+    
+  Raises:
+    Error exception if problems occur (file I/O, invalid regex, etc.).
+  """
+  # Assume "nothing found" until the action reports a match for some file.
+  exit_code = errno.ENOENT
+  output = []
+
+  start_path = os.path.expandvars(os.path.expanduser(start_path))
+
+  if abs_path:
+    start_path = os.path.abspath(start_path)
+
+  # Directories to visit in the current pass of the while loop below.
+  paths = [start_path]
+
+  files_regex = compileRegex(files_pattern)
+  
+  # dirs_regex is only defined, and only used, when recursing.
+  if recurse_dirs:
+    dirs_regex = compileRegex(dirs_pattern)
+
+  # Each pass handles one level of the directory tree; subdirectories
+  # selected during a pass are queued in sub_paths for the next pass.
+  while paths:
+    sub_paths = []
+
+    for path in paths:
+      # Take a copy ([:]) of the listing before sorting it in place; the
+      # dircache documentation says the returned list should be treated as
+      # read-only.
+      try:
+        items = dircache.listdir(path)[:]
+      except (IOError, OSError), error:
+        # Re-raise as this module's Error, keeping the errno code and
+        # prefixing the message with the original exception class name.
+        raise Error(error.args[0], '%s: %s' % (
+                    error.__class__.__name__, error.args[1]))
+
+      items.sort()
+
+      for item in items:
+        item_path = os.path.join(path, item)
+
+        if os.path.islink(item_path):
+          if not follow_symlinks:
+            continue  # do not follow symlinks (ignore them)
+
+        # Directories are never passed to the action: they are either
+        # queued for the next pass or skipped.
+        if os.path.isdir(item_path):
+          if recurse_dirs:
+            if dirs_regex.match(item):
+              sub_paths.append(item_path)
+          continue
+      
+        if files_regex.match(item):
+          try:
+            matched, found_output = action(item_path, *action_args,
+                                           **action_options)
+          except (IOError, OSError), error:
+            # Same translation to Error as for the directory listing above.
+            raise Error(error.args[0], '%s: %s' % (
+                        error.__class__.__name__, error.args[1]))
+
+          if matched:
+            exit_code = 0  # at least one matched file has now been found
+
+            # The file path precedes that file's own output lines.
+            if (not quiet_output) and (not hide_paths):
+              output.append(item_path)
+
+          if not quiet_output:
+            output.extend(found_output)
+
+    paths = sub_paths
+  
+  return exit_code, output
+
+
+class _ErrorOptionParser(optparse.OptionParser):
+  """Customized optparse.OptionParser that does not call sys.exit().
+
+  Command-line parsing problems are reported by raising this module's
+  Error exception, so that the caller decides how to report them and
+  which exit code to use.
+  """
+
+  def error(self, msg):
+    """Raises an Error exception, instead of calling sys.exit().
+
+    Args:
+      msg: text describing the command-line parsing problem
+
+    Raises:
+      Error, always, carrying errno.EINVAL and msg.
+    """
+    raise Error(errno.EINVAL, msg)
+
+
+def _buildParser():
+  """Returns a custom OptionParser for parsing command-line arguments.
+  """
+  parser = _ErrorOptionParser(__doc__)
+
+  filter_group = optparse.OptionGroup(parser,
+    'File Options',
+    'Options used to select which files to process.')
+
+  filter_group.add_option(
+    '-f', '--files', dest='files_pattern', default='^.*$',
+    metavar='FILES_REGEX',
+    help=('Python regex pattern (*not* a glob!) defining files to process'
+          ' in each directory [default: %default]'))
+
+  filter_group.add_option(
+    '-F', '--follow', dest='follow_symlinks', default=False,
+    action='store_true',
+    help=('follow file and subdirectory symlinks (possibly *DANGEROUS*)'
+          ' [default: %default]'))
+
+  parser.add_option_group(filter_group)
+
+  dir_group = optparse.OptionGroup(parser,
+    'Directory Options',
+    'Options used to indicate which directories to traverse.')
+
+  dir_group.add_option(
+    '-s', '--start', dest='start_path', default=os.curdir, metavar='PATH',
+    help='directory in which to start processing files [default: %default]')
+
+  dir_group.add_option(
+    '-R', '--recursive', dest='recurse_dirs', default=False,
+    action='store_true',
+    help='recurse into subdirectories [default: %default]')
+
+  dir_group.add_option(
+    '-d', '--dirs', dest='dirs_pattern', default='^.*$',
+    metavar='SUBDIRS_REGEX',
+    help=('Python regex pattern (*not* a glob!) defining subdirectories to'
+          ' recurse into (if --recursive) [default: %default]'))
+
+  parser.add_option_group(dir_group)
+
+  output_group = optparse.OptionGroup(parser,
+    'Output Options',
+    'Options used to control program output.')
+
+  output_group.add_option(
+    '-a', '--abspath', dest='abs_path', default=False, action='store_true',
+    help=('output absolute paths instead of relative paths'
+          ' [default: %default]'))
+
+  output_group.add_option(
+    '-p', '--nopaths', dest='hide_paths', default=False, action='store_true',
+    help=('suppress printing of file path names for successfully matched'
+          ' files to stdout [default: %default]'))
+
+  output_group.add_option(
+    '-q', '--quiet', dest='quiet_output', default=False, action='store_true',
+    help=('suppress *all* printed output to stdout (but still perform'
+          ' replacements if specified) [default: %default]'))
+
+  parser.add_option_group(output_group)
+
+  replace_group = optparse.OptionGroup(parser,
+    'Replace Options',
+    'Options applied when matches in files are replaced with substitutions.'
+    ' (Only possible if REPLACE_FORMAT is supplied.)')
+
+  replace_group.add_option(
+    '-o', '--overwrite', dest='overwrite_files', default=False,
+    action='store_true',
+    help=('overwrite original files with formatted text substituted for'
+          ' matches [default: %default]'))  
+
+  replace_group.add_option(
+    '-b', '--backup', dest='backup_ext', default='', metavar='EXTENSION',
+    help=('if supplied, and file would be overwritten, backup original'
+          ' file with the supplied extension [default is no backups of'
+          ' overwritten files are kept]'))
+
+  replace_group.add_option(
+    '-n', '--new', dest='new_ext', default='', metavar='EXTENSION',
+    help=('if supplied, and file has matches and and is altered by'
+          ' substitutions, create a new file with the supplied extension'
+          ' [default is no new file is created]'))
+
+  parser.add_option_group(replace_group)
+
+  return parser
+
+
+def _parseArgs(cmd_line_args):
+  """Builds a command-line option parser and parses command-line arguments.
+  
+  Args:
+    cmd_line_args: command-line arguments, excluding the argv[0] program name
+    
+  Returns:
+    four-tuple of action callable, supplied command-line options (including
+    those defined by defaults in the command-line parser) as a dict,
+    remaining positional command-line arguments, and the parser itself
+    
+  Raises:
+    Error if problems occurred during commmand-line argument parsing.
+  """
+  parser = _buildParser()
+  options, args = parser.parse_args(args=cmd_line_args)
+
+  if not args:
+    # no FIND_REGEX or REPLACE_PATTERN supplied, so just match based
+    # on file name and subdirectory name patterns
+    action = listFile
+  elif len(args) == 1:
+    # FIND_REGEX supplied, but not REPLACE_PATTERN, so just match based
+    # on file name and subdirectory name patterns, and then on file
+    # contents
+    action = findAllInFile
+  elif len(args) == 2:
+    # FIND_REGEX and REPLACE_PATTERN both supplied, so match based
+    # on file name and subdirectory name patterns, and then do a find and
+    # replace on file contents
+    action = replaceAllInFile
+  else:
+    raise Error(errno.EINVAL,'too many (%d) arguments supplied:\n%s' % (
+                len(args), ' '.join(args)))
+
+  return action, vars(options), args, parser
+
+ 
+def _main(argv):
+  """Wrapper that catches exceptions, prints output, and returns exit status.
+  
+  Normal program output is printed to stdout.  Error output (including
+  exception text) is printed to stderr.
+  
+  Args:
+    argv: script arguments, usually sys.argv; argv[0] is expected to be the
+      program name
+      
+  Returns:
+    exit code suitable for sys.exit()
+  """
+  options = {}  # empty options, used if _parseArgs() fails
+
+  try:
+    action, options, args, parser = _parseArgs(argv[1:])
+    exit_code, output = applyActionToFiles(action, args, **options)
+
+    if output:  print '\n'.join(output)
+
+  except Error, error:
+    if not options.get('quiet_output'):
+      print >>sys.stderr, '\nERROR: (%s: %s) %s\n' % (
+        error.args[0], os.strerror(error.args[0]), error.args[1])
+      print >>sys.stderr, parser.get_usage()
+
+    exit_code = error.args[0]
+
+  return exit_code
+
+
+# When run as a script, exit with the status code returned by _main().
+if __name__ == '__main__':
+  sys.exit(_main(sys.argv))