scripts/munge.py
author Lennard de Rijk <ljvderijk@gmail.com>
Fri, 14 Aug 2009 10:28:27 -0700
changeset 2776 54df459f33a9
parent 541 d572b0fb6bfe
permissions -rwxr-xr-x
Added tag v0-5-20090814p1 for changeset 4027acdbf91d

#!/usr/bin/python2.5
#
# Copyright 2008 the Melange authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# __doc__ string is slightly unconventional because it is used as usage text
"""%prog [OPTIONS] [FIND_REGEX] [REPLACE_FORMAT]

Script to list, search, and modify files using Python regex patterns.

OPTIONS:  optional command-line flags; see %prog --help

FIND_REGEX:  an optional valid Python regular expression pattern;
  if supplied, only files containing at least one match will be processed;
  matching file paths will be printed; if supplied, REPLACE_FORMAT will be
  used to convert the match groups into formatted output.

REPLACE_FORMAT:  an optional valid Python format string;
  FIND_REGEX must be supplied first if REPLACE_FORMAT is supplied;
  positional arguments will be replaced with ordered groups from
  FIND_REGEX matches, and named arguments will be replaced with named
  groups from FIND_REGEX matches."""

# Module authorship metadata: one '"Name" <email>' string per author.
__authors__ = [
  '"Todd Larsen" <tlarsen@google.com>',
]


import dircache
import errno
import os
import optparse
import re
import sre_constants
import sys


class Error(Exception):
  """Root exception type for every failure reported by this module.

  Throughout this file it is raised with two positional arguments: an
  errno-style integer code followed by a human-readable message.
  """


def compileRegex(pattern):
  """Compiles a Python regex pattern into a regex object.

  Args:
    pattern: valid Python regex pattern string, or an already-compiled
      regex object (which re.compile() hands back unchanged)

  Returns:
    regex object compiled from pattern

  Raises:
    Error (with code errno.EINVAL) if pattern could not be compiled; the
    message contains the regex engine's complaint followed by the pattern.
  """
  try:
    return re.compile(pattern)
  except re.error:
    # re.error is the public name of the regex compilation exception, so
    # no internal sre_* module needs to be referenced here.
    # sys.exc_info() fetches the active exception without relying on
    # version-specific "except ..., name" / "except ... as name" syntax.
    error = sys.exc_info()[1]
    msg = 're.compile: %s\n%s' % (error.args[0], pattern)
    raise Error(errno.EINVAL, msg)


def findAll(text_to_search, pattern):
  """Returns all matches of a regex in a string.

  Args:
    text_to_search: string in which to find matches
    pattern: Python regex pattern (or already-compiled regex object)
      indicating which matches to retrieve

  Returns:
    a (possibly empty) list holding the full text of each non-overlapping
    match, in the order the matches occur in text_to_search

  Raises:
    Error if pattern could not be compiled (see compileRegex()).
  """
  regex = compileRegex(pattern)

  # finditer() visits each match directly; nothing but the matched text is
  # accumulated, so no substituted copy of text_to_search is ever built.
  return [match.group() for match in regex.finditer(text_to_search)]


def getFileContents(file_path):
  """Reads the contents of a file as a single string, then closes the file.

  Args:
    file_path: path to the file to read its contents into a string

  Returns:
    a single string containing the entire contents of the file

  Raises:
    whatever open() or read() raise (IOError/OSError) when the file cannot
    be opened or read; the file is closed before the exception propagates.
  """
  file_to_read = open(file_path)

  # try/finally guarantees the handle is released even when read() fails;
  # it is used instead of a "with" block because the script's shebang
  # targets python2.5, where "with" is not available by default.
  try:
    return file_to_read.read()
  finally:
    file_to_read.close()


def findAllInFile(file_path, pattern, *ignored_args, **ignored_kwargs):
  """Action that collects every pattern match found in one file.

  Args:
    file_path: path of the file whose contents are searched
    pattern: regex pattern (or compiled regex); see findAll()
    *ignored_args: extra positional command-line arguments; accepted so
      all actions share one call signature, but not used here
    **ignored_kwargs: extra command-line options; accepted for the same
      reason, and likewise not used here

  Returns:
    two-tuple of (found, matches): found is True when at least one match
    exists and False otherwise; matches is the (possibly empty) list of
    matched strings, suitable for printing as the action's output
  """
  contents = getFileContents(file_path)
  matched_text = findAll(contents, pattern)
  return bool(matched_text), matched_text


def replaceAll(original, pattern, format):
  """Substitutes formatted text for all matches in a string.

  Args:
    original: original string in which to find and replace matches
    pattern: Python regex pattern (or already-compiled regex object)
      indicating which matches to replace
    format: Python format string specifying how to format the
      replacement text; how this format string is interpreted depends
      on the contents of the pattern;  if the pattern contains:
        named groups: format is expected to contain named format specifiers
        unnamed groups: format is expected to contain exactly the same
          number of unnamed format specifiers as the number of groups in
          pattern
        no groups: format is expected to contain a single format specifier
          (in which case the entire match is supplied to it), or no format
          specifier at all (in which case the "format" string simply
          replaces the match with no substitutions from the match itself)

  Returns:
    two-tuple of the text with all matches replaced as specified by
    pattern and format, and a flat list of the original matches, each
    followed by its replacement

  Raises:
    Error if pattern could not be compiled (see compileRegex()).
  """
  matches_and_replacements = []

  def _replaceWithFormat(match):
    """Formats one match, trying named groups, then groups, then the match."""
    # None (not merely a false value) marks "no interpretation has
    # succeeded yet"; an empty string is a perfectly valid replacement and
    # must not trigger the fallbacks below.
    formatted_match = None

    if match.groupdict():
      try:
        formatted_match = format % match.groupdict()
      except TypeError:
        pass  # format does not accept a mapping; try the next form

    if formatted_match is None and match.groups():
      try:
        formatted_match = format % match.groups()
      except TypeError:
        pass  # specifier count does not fit the groups; try the next form

    if formatted_match is None:
      try:
        formatted_match = format % match.group()
      except TypeError:
        # no usable specifier at all: format is a literal replacement
        formatted_match = format

    matches_and_replacements.append(match.group())
    matches_and_replacements.append(formatted_match)
    return formatted_match

  replaced = compileRegex(pattern).sub(_replaceWithFormat, original)

  return replaced, matches_and_replacements


def writeAltFileIfExt(path, ext, contents):
  """Writes a file if path and additional extension are supplied.

  If path or ext are not supplied, no file is written.  The file that is
  written is named path + '.' + ext; an existing file of that name is
  overwritten.

  Args:
    path: path of file to be written, to which ext will be appended
    ext: additional file extension that will be appended to path; a single
      leading '.' is dropped, so 'bak' and '.bak' name the same file
    contents: contents of file to be written, as a string

  Raises:
    whatever open() or write() raise (IOError/OSError) on failure; the
    file is closed before the exception propagates.
  """
  if (not path) or (not ext):
    return

  if ext.startswith('.'):
    ext = ext[1:]

  alt_file = open('%s.%s' % (path, ext), 'w')

  # try/finally ensures the handle is closed even if write() fails.
  try:
    alt_file.write(contents)
  finally:
    alt_file.close()


def replaceAllInFile(file_path, pattern, format,
                     new_ext=None, backup_ext=None,
                     overwrite_files=False,
                     *ignored_args, **ignored_kwargs):
  """Substitutes formatted text for all matches in a file.

  Args:
    file_path: path of file to manipulate
    pattern, format: see replaceAll()
    new_ext: if supplied (and any match was found), the replaced text is
      written to file_path with this extension appended
    backup_ext: if supplied (and any match was found), the original text
      is written to file_path with this extension appended
    overwrite_files: if True, file_path itself is rewritten with the
      replaced text, but only when that text differs from the original
    *ignored_args: other positional arguments which are ignored
      command-line arguments not used by this action callable
    **ignored_kwargs: other keyword arguments which are ignored
      command-line options not used by this action callable

  Returns:
    two-tuple of boolean indicating if any match was found and a
    list of printable output text lines containing pairs of original
    pattern matches each followed by the formatted replacement

  Raises:
    Error if pattern could not be compiled; IOError/OSError if reading or
    writing any of the files fails.
  """
  original = getFileContents(file_path)

  replaced, matches_and_replacements = replaceAll(
    original, pattern, format)

  if not matches_and_replacements:
    return False, matches_and_replacements

  # NOTE: both alternate files are written whenever a match was found,
  # independent of overwrite_files and of whether the text changed.
  writeAltFileIfExt(file_path, new_ext, replaced)
  writeAltFileIfExt(file_path, backup_ext, original)

  if overwrite_files and (replaced != original):
    replaced_file = open(file_path, 'w')

    # try/finally ensures the handle is closed even if write() fails.
    try:
      replaced_file.write(replaced)
    finally:
      replaced_file.close()

  return True, matches_and_replacements


def listFile(*ignored_args, **ignored_kwargs):
  """Action that reports every file as matched and produces no text.

  All arguments are accepted and ignored.  Selection by file name has
  already been done by the caller, so there is nothing left to check here.

  Returns:
    the two-tuple (True, []): "matched", with an empty list of output lines
  """
  matched = True
  no_output = []
  return matched, no_output


def _wrapEnvironmentError(error):
  """Converts an IOError or OSError into this module's Error exception.

  Args:
    error: the caught IOError or OSError instance

  Returns:
    an Error whose first argument is the error code and whose second is
    '<exception class name>: <error text>'
  """
  if len(error.args) >= 2:
    code, text = error.args[:2]
  else:
    # Errors constructed with fewer than two arguments carry no separate
    # code; fall back to a generic I/O error code and the error's own text
    # rather than failing with an IndexError that hides the real problem.
    code, text = errno.EIO, str(error)

  return Error(code, '%s: %s' % (error.__class__.__name__, text))


def applyActionToFiles(action, action_args,
                       start_path='', abs_path=False, files_pattern='',
                       recurse_dirs=False, dirs_pattern='',
                       follow_symlinks=False, quiet_output=False,
                       hide_paths=False, hide_text=False, **action_options):
  """Applies a callable action to files, based on options and arguments.

  Directories are visited level by level starting at start_path, and the
  entries of each directory are processed in sorted order.

  Args:
    action: callable that expects a file path argument, positional arguments
      (action_args), and keyword options from the command-line options dict;
      and returns a "matched" boolean and a list of output strings
    action_args: list of positional arguments, if any; passed to action
      callable unchanged
    start_path: required path of initial directory to visit; '~' and
      environment variables in it are expanded
    abs_path: optional boolean indicating to use absolute paths
    files_pattern: required Python regex (object or pattern) which selects
      which files to pass to the action callable; matched against the
      file name (not the whole path) from its beginning
    recurse_dirs: boolean indicating if subdirectories should be traversed
    dirs_pattern: Python regex (object or pattern) which selects which
      subdirectories to traverse if recurse_dirs is True; matched against
      the subdirectory name from its beginning
    follow_symlinks: boolean indicating if symlinks should be traversed;
      when False, symlinked files and directories are skipped entirely
    quiet_output: optional boolean indicating if output should be suppressed
    hide_paths: optional boolean indicating to omit file paths from output
    hide_text: optional boolean indicating to omit find/replace text from
      output
    **action_options: remaining keyword arguments that are passed unchanged
      to the action callable

  Returns:
    two-tuple containing an exit code and a (possibly empty) list of
    output strings; the exit code is 0 if the action reported a match for
    at least one file, and errno.ENOENT otherwise

  Raises:
    Error exception if problems occur (file I/O, invalid regex, etc.).
  """
  exit_code = errno.ENOENT  # stays ENOENT until some file is matched
  output = []

  start_path = os.path.expandvars(os.path.expanduser(start_path))

  if abs_path:
    start_path = os.path.abspath(start_path)

  files_regex = compileRegex(files_pattern)

  # dirs_regex is only consulted (and so only compiled) when recursing
  if recurse_dirs:
    dirs_regex = compileRegex(dirs_pattern)

  show_paths = not (quiet_output or hide_paths)
  show_text = not (quiet_output or hide_text)

  paths = [start_path]

  while paths:
    sub_paths = []  # directories to visit on the next level

    for path in paths:
      try:
        items = sorted(os.listdir(path))
      except (IOError, OSError):
        raise _wrapEnvironmentError(sys.exc_info()[1])

      for item in items:
        item_path = os.path.join(path, item)

        if os.path.islink(item_path) and (not follow_symlinks):
          continue  # do not follow symlinks (ignore them)

        if os.path.isdir(item_path):
          if recurse_dirs and dirs_regex.match(item):
            sub_paths.append(item_path)
          continue  # directories are never passed to the action

        if not (os.path.isfile(item_path) and files_regex.match(item)):
          continue

        try:
          matched, found_output = action(item_path, *action_args,
                                         **action_options)
        except (IOError, OSError):
          raise _wrapEnvironmentError(sys.exc_info()[1])

        if matched:
          exit_code = 0  # at least one matched file has now been found

          if show_paths:
            output.append(item_path)

        if show_text:
          output.extend(found_output)

    paths = sub_paths

  return exit_code, output


class _ErrorOptionParser(optparse.OptionParser):
  """OptionParser variant that reports errors by raising, not by exiting.
  """

  def error(self, msg):
    """Raises Error(errno.EINVAL, msg) in place of calling sys.exit().

    Args:
      msg: text describing the command-line problem
    """
    failure = Error(errno.EINVAL, msg)
    raise failure


def _buildParser():
  """Returns a custom OptionParser for parsing command-line arguments.

  The parser uses the module docstring as its usage text and raises Error
  (instead of exiting) when the command line is invalid.  Options are
  arranged in four groups: file selection, directory traversal, output
  control, and replacement behavior.
  """
  parser = _ErrorOptionParser(__doc__)

  filter_group = optparse.OptionGroup(parser,
    'File Options',
    'Options used to select which files to process.')

  # The default rejects names *ending* in one of the listed extensions.
  # The alternatives are grouped so that the trailing '$' applies to every
  # extension; the pattern is applied with match(), i.e. from the start of
  # the file name.
  filter_group.add_option(
    '-f', '--files', dest='files_pattern',
    default=r'(?!.*\.(?:pyc|ico|gif|png|jpg)$)',
    metavar='FILES_REGEX',
    help=('Python regex pattern (*not* a glob!) defining files to process'
          ' in each directory [default: %default]'))

  filter_group.add_option(
    '-F', '--follow', dest='follow_symlinks', default=False,
    action='store_true',
    help=('follow file and subdirectory symlinks (possibly *DANGEROUS*)'
          ' [default: %default]'))

  parser.add_option_group(filter_group)

  dir_group = optparse.OptionGroup(parser,
    'Directory Options',
    'Options used to indicate which directories to traverse.')

  dir_group.add_option(
    '-s', '--start', dest='start_path', default=os.curdir, metavar='PATH',
    help='directory in which to start processing files [default: %default]')

  dir_group.add_option(
    '-R', '--recursive', dest='recurse_dirs', default=False,
    action='store_true',
    help='recurse into subdirectories [default: %default]')

  # The default skips subdirectories whose names begin with a dot.
  dir_group.add_option(
    '-d', '--dirs', dest='dirs_pattern', default='^[^.].*$',
    metavar='SUBDIRS_REGEX',
    help=('Python regex pattern (*not* a glob!) defining subdirectories to'
          ' recurse into (if --recursive) [default: %default]'))

  parser.add_option_group(dir_group)

  output_group = optparse.OptionGroup(parser,
    'Output Options',
    'Options used to control program output.')

  output_group.add_option(
    '-a', '--abspath', dest='abs_path', default=False, action='store_true',
    help=('output absolute paths instead of relative paths'
          ' [default: %default]'))

  # --nopaths and --notext have long forms only.
  output_group.add_option(
    '--nopaths', dest='hide_paths', default=False, action='store_true',
    help=('suppress printing of file path names for successfully matched'
          ' files to stdout [default: %default]'))

  output_group.add_option(
    '--notext', dest='hide_text', default=False, action='store_true',
    help=('suppress find/replace text output to stdout (but still print'
          ' paths if not --nopaths, and still perform replacements if'
          ' specified) [default: %default]'))

  output_group.add_option(
    '-q', '--quiet', dest='quiet_output', default=False, action='store_true',
    help=('suppress *all* printed output to stdout (but still perform'
          ' replacements if specified) [default: %default]'))

  parser.add_option_group(output_group)

  replace_group = optparse.OptionGroup(parser,
    'Replace Options',
    'Options applied when matches in files are replaced with substitutions.'
    ' (Only possible if REPLACE_FORMAT is supplied.)')

  replace_group.add_option(
    '-o', '--overwrite', dest='overwrite_files', default=False,
    action='store_true',
    help=('overwrite original files with formatted text substituted for'
          ' matches [default: %default]'))

  replace_group.add_option(
    '-b', '--backup', dest='backup_ext', default='', metavar='EXTENSION',
    help=('if supplied, and file would be overwritten, backup original'
          ' file with the supplied extension [default is no backups of'
          ' overwritten files are kept]'))

  replace_group.add_option(
    '-n', '--new', dest='new_ext', default='', metavar='EXTENSION',
    help=('if supplied, and file has matches and is altered by'
          ' substitutions, create a new file with the supplied extension'
          ' [default is no new file is created]'))

  parser.add_option_group(replace_group)

  return parser


def _parseArgs(cmd_line_args):
  """Parses the command line and selects the action it calls for.

  Args:
    cmd_line_args: command-line arguments, excluding the argv[0] program name

  Returns:
    four-tuple of: the action callable; the command-line options as a dict
    (including those that only have their parser-defined defaults); the
    remaining positional command-line arguments; and the parser itself

  Raises:
    Error if the command line could not be parsed, or if more than two
    positional arguments were supplied.
  """
  parser = _buildParser()
  options, args = parser.parse_args(args=cmd_line_args)

  # Indexed by the number of positional arguments supplied:
  #   0: neither FIND_REGEX nor REPLACE_FORMAT, so files are selected by
  #      file name and subdirectory name patterns alone
  #   1: FIND_REGEX only, so selected files are also searched by contents
  #   2: FIND_REGEX and REPLACE_FORMAT, so a find and replace is done on
  #      the contents of the selected files
  actions_by_arg_count = (listFile, findAllInFile, replaceAllInFile)
  arg_count = len(args)

  if arg_count >= len(actions_by_arg_count):
    raise Error(errno.EINVAL,'too many (%d) arguments supplied:\n%s' % (
                arg_count, ' '.join(args)))

  return actions_by_arg_count[arg_count], vars(options), args, parser

 
def _main(argv):
  """Runs the script, prints its results, and returns the exit status.

  Regular output goes to stdout.  When an Error is caught, its description
  (followed by the usage text, if the parser was built) goes to stderr,
  unless the quiet option was successfully parsed and set.

  Args:
    argv: script arguments, usually sys.argv; argv[0] is expected to be the
      program name

  Returns:
    exit code suitable for sys.exit()
  """
  options = {}  # stays empty if _parseArgs() fails
  parser = None

  try:
    action, options, args, parser = _parseArgs(argv[1:])
    exit_code, output = applyActionToFiles(action, args, **options)

    if output:
      sys.stdout.write('\n'.join(output) + '\n')

  except Error:
    error = sys.exc_info()[1]

    if not options.get('quiet_output'):
      error_code = error.args[0]
      report = '\nERROR: (%s: %s) %s\n' % (
        error_code, os.strerror(error_code), error.args[1])
      sys.stderr.write(report + '\n')

      if parser:
        sys.stderr.write(parser.get_usage() + '\n')

    exit_code = error.args[0]

  return exit_code


if __name__ == '__main__':
  # Executed as a script: the value returned by _main() becomes the
  # process exit status.
  exit_status = _main(sys.argv)
  sys.exit(exit_status)