eggs/zc.buildout-1.5.2-py2.6.egg/zc/buildout/download.txt
changeset 69 c6bca38c1cbf
equal deleted inserted replaced
68:5ff1fc726848 69:c6bca38c1cbf
       
     1 Using the download utility
       
     2 ==========================
       
     3 
       
     4 The ``zc.buildout.download`` module provides a download utility that handles
       
     5 the details of downloading files needed for a buildout run from the internet.
       
     6 It downloads files to the local file system, using the download cache if
       
     7 desired and optionally checking the downloaded files' MD5 checksum.
       
     8 
       
     9 We set up an HTTP server that provides a file we want to download:
       
    10 
       
    11 >>> server_data = tmpdir('sample_files')
       
    12 >>> write(server_data, 'foo.txt', 'This is a foo text.')
       
    13 >>> server_url = start_server(server_data)
       
    14 
       
    15 We also use a fresh directory for temporary files in order to make sure that
       
    16 all temporary files have been cleaned up in the end:
       
    17 
       
    18 >>> import tempfile
       
    19 >>> old_tempdir = tempfile.tempdir
       
    20 >>> tempfile.tempdir = tmpdir('tmp')
       
    21 
       
    22 
       
    23 Downloading without using the cache
       
    24 -----------------------------------
       
    25 
       
    26 If no download cache should be used, the download utility is instantiated
       
    27 without any arguments:
       
    28 
       
    29 >>> from zc.buildout.download import Download
       
    30 >>> download = Download()
       
    31 >>> print download.cache_dir
       
    32 None
       
    33 
       
    34 Downloading a file is achieved by calling the utility with the URL as an
       
    35 argument. A tuple is returned that consists of the path to the downloaded copy
       
    36 of the file and a boolean value indicating whether this is a temporary file
       
    37 meant to be cleaned up during the same buildout run:
       
    38 
       
    39 >>> path, is_temp = download(server_url+'foo.txt')
       
    40 >>> print path
       
    41 /.../buildout-...
       
    42 >>> cat(path)
       
    43 This is a foo text.
       
    44 
       
    45 As we aren't using the download cache and haven't specified a target path
       
    46 either, the download has ended up in a temporary file:
       
    47 
       
    48 >>> is_temp
       
    49 True
       
    50 
       
    51 >>> import tempfile
       
    52 >>> path.startswith(tempfile.gettempdir())
       
    53 True
       
    54 
       
    55 We are responsible for cleaning up temporary files after ourselves:
       
    56 
       
    57 >>> remove(path)
       
    58 
       
    59 When trying to access a file that doesn't exist, we'll get an exception:
       
    60 
       
    61 >>> try: download(server_url+'not-there') # doctest: +ELLIPSIS
       
    62 ... except: print 'download error'
       
    63 ... else: print 'woops'
       
    64 download error
       
    65 
       
    66 Downloading a local file doesn't produce a temporary file but simply returns
       
    67 the local file itself:
       
    68 
       
    69 >>> download(join(server_data, 'foo.txt'))
       
    70 ('/sample_files/foo.txt', False)
       
    71 
       
    72 We can also have the downloaded file's MD5 sum checked:
       
    73 
       
    74 >>> try: from hashlib import md5
       
    75 ... except ImportError: from md5 import new as md5
       
    76 
       
    77 >>> path, is_temp = download(server_url+'foo.txt',
       
    78 ...                          md5('This is a foo text.').hexdigest())
       
    79 >>> is_temp
       
    80 True
       
    81 >>> remove(path)
       
    82 
       
    83 >>> download(server_url+'foo.txt',
       
    84 ...          md5('The wrong text.').hexdigest())
       
    85 Traceback (most recent call last):
       
    86 ChecksumError: MD5 checksum mismatch downloading 'http://localhost/foo.txt'
       
    87 
       
    88 The error message in the event of an MD5 checksum mismatch for a local file
       
    89 reads somewhat differently:
       
    90 
       
    91 >>> download(join(server_data, 'foo.txt'),
       
    92 ...               md5('This is a foo text.').hexdigest())
       
    93 ('/sample_files/foo.txt', False)
       
    94 
       
    95 >>> download(join(server_data, 'foo.txt'),
       
    96 ...          md5('The wrong text.').hexdigest())
       
    97 Traceback (most recent call last):
       
    98 ChecksumError: MD5 checksum mismatch for local resource at '/sample_files/foo.txt'.
       
    99 
       
   100 Finally, we can download the file to a specified place in the file system:
       
   101 
       
   102 >>> target_dir = tmpdir('download-target')
       
   103 >>> path, is_temp = download(server_url+'foo.txt',
       
   104 ...                          path=join(target_dir, 'downloaded.txt'))
       
   105 >>> print path
       
   106 /download-target/downloaded.txt
       
   107 >>> cat(path)
       
   108 This is a foo text.
       
   109 >>> is_temp
       
   110 False
       
   111 
       
   112 Trying to download a file in offline mode will result in an error:
       
   113 
       
   114 >>> download = Download(cache=None, offline=True)
       
   115 >>> download(server_url+'foo.txt')
       
   116 Traceback (most recent call last):
       
   117 UserError: Couldn't download 'http://localhost/foo.txt' in offline mode.
       
   118 
       
   119 As an exception to this rule, file system paths and URLs in the ``file``
       
   120 scheme will still work:
       
   121 
       
   122 >>> cat(download(join(server_data, 'foo.txt'))[0])
       
   123 This is a foo text.
       
   124 >>> cat(download('file:' + join(server_data, 'foo.txt'))[0])
       
   125 This is a foo text.
       
   126 
       
   127 >>> remove(path)
       
   128 
       
   129 
       
   130 Downloading using the download cache
       
   131 ------------------------------------
       
   132 
       
   133 In order to make use of the download cache, we need to configure the download
       
   134 utility differently. To do this, we pass a directory path as the ``cache``
       
   135 keyword argument upon instantiation:
       
   136 
       
   137 >>> cache = tmpdir('download-cache')
       
   138 >>> download = Download(cache=cache)
       
   139 >>> print download.cache_dir
       
   140 /download-cache/
       
   141 
       
   142 Simple usage
       
   143 ~~~~~~~~~~~~
       
   144 
       
   145 When using the cache, a file will be stored in the cache directory when it is
       
   146 first downloaded. The file system path returned by the download utility points
       
   147 to the cached copy:
       
   148 
       
   149 >>> ls(cache)
       
   150 >>> path, is_temp = download(server_url+'foo.txt')
       
   151 >>> print path
       
   152 /download-cache/foo.txt
       
   153 >>> cat(path)
       
   154 This is a foo text.
       
   155 >>> is_temp
       
   156 False
       
   157 
       
   158 Whenever the file is downloaded again, the cached copy is used. Let's change
       
   159 the file on the server to see this:
       
   160 
       
   161 >>> write(server_data, 'foo.txt', 'The wrong text.')
       
   162 >>> path, is_temp = download(server_url+'foo.txt')
       
   163 >>> print path
       
   164 /download-cache/foo.txt
       
   165 >>> cat(path)
       
   166 This is a foo text.
       
   167 
       
   168 If we specify an MD5 checksum for a file that is already in the cache, the
       
   169 cached copy's checksum will be verified:
       
   170 
       
   171 >>> download(server_url+'foo.txt', md5('The wrong text.').hexdigest())
       
   172 Traceback (most recent call last):
       
   173 ChecksumError: MD5 checksum mismatch for cached download
       
   174                from 'http://localhost/foo.txt' at '/download-cache/foo.txt'
       
   175 
       
   176 Trying to access another file at a different URL which has the same base name
       
   177 will result in the cached copy being used:
       
   178 
       
   179 >>> mkdir(server_data, 'other')
       
   180 >>> write(server_data, 'other', 'foo.txt', 'The wrong text.')
       
   181 >>> path, is_temp = download(server_url+'other/foo.txt')
       
   182 >>> print path
       
   183 /download-cache/foo.txt
       
   184 >>> cat(path)
       
   185 This is a foo text.
       
   186 
       
   187 Given a target path for the download, the utility will provide a copy of the
       
   188 file at that location both when first downloading the file and when using a
       
   189 cached copy:
       
   190 
       
   191 >>> remove(cache, 'foo.txt')
       
   192 >>> ls(cache)
       
   193 >>> write(server_data, 'foo.txt', 'This is a foo text.')
       
   194 
       
   195 >>> path, is_temp = download(server_url+'foo.txt',
       
   196 ...                          path=join(target_dir, 'downloaded.txt'))
       
   197 >>> print path
       
   198 /download-target/downloaded.txt
       
   199 >>> cat(path)
       
   200 This is a foo text.
       
   201 >>> is_temp
       
   202 False
       
   203 >>> ls(cache)
       
   204 - foo.txt
       
   205 
       
   206 >>> remove(path)
       
   207 >>> write(server_data, 'foo.txt', 'The wrong text.')
       
   208 
       
   209 >>> path, is_temp = download(server_url+'foo.txt',
       
   210 ...                          path=join(target_dir, 'downloaded.txt'))
       
   211 >>> print path
       
   212 /download-target/downloaded.txt
       
   213 >>> cat(path)
       
   214 This is a foo text.
       
   215 >>> is_temp
       
   216 False
       
   217 
       
   218 In offline mode, downloads from any URL will be successful if the file is
       
   219 found in the cache:
       
   220 
       
   221 >>> download = Download(cache=cache, offline=True)
       
   222 >>> cat(download(server_url+'foo.txt')[0])
       
   223 This is a foo text.
       
   224 
       
   225 Local resources will be cached just like any others since download caches are
       
   226 sometimes used to create source distributions:
       
   227 
       
   228 >>> remove(cache, 'foo.txt')
       
   229 >>> ls(cache)
       
   230 
       
   231 >>> write(server_data, 'foo.txt', 'This is a foo text.')
       
   232 >>> download = Download(cache=cache)
       
   233 
       
   234 >>> cat(download('file:' + join(server_data, 'foo.txt'), path=path)[0])
       
   235 This is a foo text.
       
   236 >>> ls(cache)
       
   237 - foo.txt
       
   238 
       
   239 >>> remove(cache, 'foo.txt')
       
   240 
       
   241 >>> cat(download(join(server_data, 'foo.txt'), path=path)[0])
       
   242 This is a foo text.
       
   243 >>> ls(cache)
       
   244 - foo.txt
       
   245 
       
   246 >>> remove(cache, 'foo.txt')
       
   247 
       
   248 However, resources with checksum mismatches will not be copied to the cache:
       
   249 
       
   250 >>> download(server_url+'foo.txt', md5('The wrong text.').hexdigest())
       
   251 Traceback (most recent call last):
       
   252 ChecksumError: MD5 checksum mismatch downloading 'http://localhost/foo.txt'
       
   253 >>> ls(cache)
       
   254 
       
   255 >>> remove(path)
       
   256 
       
   257 Finally, let's see what happens if the download cache to be used doesn't exist
       
   258 as a directory in the file system yet:
       
   259 
       
   260 >>> Download(cache=join(cache, 'non-existent'))(server_url+'foo.txt')
       
   261 Traceback (most recent call last):
       
   262 UserError: The directory:
       
   263 '/download-cache/non-existent'
       
   264 to be used as a download cache doesn't exist.
       
   265 
       
   266 Using namespace sub-directories of the download cache
       
   267 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
       
   268 
       
   269 It is common to store cached copies of downloaded files within sub-directories
       
   270 of the download cache to keep some degree of order. For example, zc.buildout
       
   271 stores downloaded distributions in a sub-directory named "dist". Those
       
   272 sub-directories are also known as namespaces. So far, we haven't specified any
       
   273 namespaces to use, so the download utility stored files directly inside the
       
   274 download cache. Let's use a namespace "test" instead:
       
   275 
       
   276 >>> download = Download(cache=cache, namespace='test')
       
   277 >>> print download.cache_dir
       
   278 /download-cache/test
       
   279 
       
   280 The namespace sub-directory hasn't been created yet:
       
   281 
       
   282 >>> ls(cache)
       
   283 
       
   284 Downloading a file now creates the namespace sub-directory and places a copy
       
   285 of the file inside it:
       
   286 
       
   287 >>> path, is_temp = download(server_url+'foo.txt')
       
   288 >>> print path
       
   289 /download-cache/test/foo.txt
       
   290 >>> ls(cache)
       
   291 d test
       
   292 >>> ls(cache, 'test')
       
   293 - foo.txt
       
   294 >>> cat(path)
       
   295 This is a foo text.
       
   296 >>> is_temp
       
   297 False
       
   298 
       
   299 The next time we want to download that file, the copy from inside the cache
       
   300 namespace is used. To see this clearly, we put a file with the same name but
       
   301 different content both on the server and in the cache's root directory:
       
   302 
       
   303 >>> write(server_data, 'foo.txt', 'The wrong text.')
       
   304 >>> write(cache, 'foo.txt', 'The wrong text.')
       
   305 
       
   306 >>> path, is_temp = download(server_url+'foo.txt')
       
   307 >>> print path
       
   308 /download-cache/test/foo.txt
       
   309 >>> cat(path)
       
   310 This is a foo text.
       
   311 
       
   312 >>> rmdir(cache, 'test')
       
   313 >>> remove(cache, 'foo.txt')
       
   314 >>> write(server_data, 'foo.txt', 'This is a foo text.')
       
   315 
       
   316 Using a hash of the URL as the filename in the cache
       
   317 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
       
   318 
       
   319 So far, the base name of the downloaded file read from the URL has been used
       
   320 for the name of the cached copy of the file. This may not be desirable in some
       
   321 cases, for example when downloading files from different locations that have
       
   322 the same base name due to some naming convention, or if the file content
       
   323 depends on URL parameters. In such cases, an MD5 hash of the complete URL may
       
   324 be used as the filename in the cache:
       
   325 
       
   326 >>> download = Download(cache=cache, hash_name=True)
       
   327 >>> path, is_temp = download(server_url+'foo.txt')
       
   328 >>> print path
       
   329 /download-cache/09f5793fcdc1716727f72d49519c688d
       
   330 >>> cat(path)
       
   331 This is a foo text.
       
   332 >>> ls(cache)
       
   333 - 09f5793fcdc1716727f72d49519c688d
       
   334 
       
   335 The path was printed just to illustrate matters; we cannot know the real
       
   336 checksum since we don't know which port the server happens to listen at when
       
   337 the test is run, so we don't actually know the full URL of the file. Let's
       
   338 check that the checksum actually belongs to the particular URL used:
       
   339 
       
   340 >>> path.lower() == join(cache, md5(server_url+'foo.txt').hexdigest()).lower()
       
   341 True
       
   342 
       
   343 The cached copy is used when downloading the file again:
       
   344 
       
   345 >>> write(server_data, 'foo.txt', 'The wrong text.')
       
   346 >>> (path, is_temp) == download(server_url+'foo.txt')
       
   347 True
       
   348 >>> cat(path)
       
   349 This is a foo text.
       
   350 >>> ls(cache)
       
   351 - 09f5793fcdc1716727f72d49519c688d
       
   352 
       
   353 If we change the URL, even in such a way that it keeps the base name of the
       
   354 file the same, the file will be downloaded again this time and put in the
       
   355 cache under a different name:
       
   356 
       
   357 >>> path2, is_temp = download(server_url+'other/foo.txt')
       
   358 >>> print path2
       
   359 /download-cache/537b6d73267f8f4447586989af8c470e
       
   360 >>> path == path2
       
   361 False
       
   362 >>> path2.lower() == join(cache, md5(server_url+'other/foo.txt').hexdigest()).lower()
       
   363 True
       
   364 >>> cat(path)
       
   365 This is a foo text.
       
   366 >>> cat(path2)
       
   367 The wrong text.
       
   368 >>> ls(cache)
       
   369 - 09f5793fcdc1716727f72d49519c688d
       
   370 - 537b6d73267f8f4447586989af8c470e
       
   371 
       
   372 >>> remove(path)
       
   373 >>> remove(path2)
       
   374 >>> write(server_data, 'foo.txt', 'This is a foo text.')
       
   375 
       
   376 
       
   377 Using the cache purely as a fall-back
       
   378 -------------------------------------
       
   379 
       
   380 Sometimes it is desirable to try downloading a file from the net if at all
       
   381 possible, and use the cache purely as a fall-back option when a server is
       
   382 down or if we are in offline mode. This mode is only in effect if a download
       
   383 cache is configured in the first place:
       
   384 
       
   385 >>> download = Download(cache=cache, fallback=True)
       
   386 >>> print download.cache_dir
       
   387 /download-cache/
       
   388 
       
   389 A downloaded file will be cached:
       
   390 
       
   391 >>> ls(cache)
       
   392 >>> path, is_temp = download(server_url+'foo.txt')
       
   393 >>> ls(cache)
       
   394 - foo.txt
       
   395 >>> cat(cache, 'foo.txt')
       
   396 This is a foo text.
       
   397 >>> is_temp
       
   398 False
       
   399 
       
   400 If the file cannot be served, the cached copy will be used:
       
   401 
       
   402 >>> remove(server_data, 'foo.txt')
       
   403 >>> try: Download()(server_url+'foo.txt') # doctest: +ELLIPSIS
       
   404 ... except: print 'download error'
       
   405 ... else: print 'woops'
       
   406 download error
       
   407 >>> path, is_temp = download(server_url+'foo.txt')
       
   408 >>> cat(path)
       
   409 This is a foo text.
       
   410 >>> is_temp
       
   411 False
       
   412 
       
   413 Similarly, if the file is served but we're in offline mode, we'll fall back to
       
   414 using the cache:
       
   415 
       
   416 >>> write(server_data, 'foo.txt', 'The wrong text.')
       
   417 >>> get(server_url+'foo.txt')
       
   418 'The wrong text.'
       
   419 
       
   420 >>> offline_download = Download(cache=cache, offline=True, fallback=True)
       
   421 >>> path, is_temp = offline_download(server_url+'foo.txt')
       
   422 >>> print path
       
   423 /download-cache/foo.txt
       
   424 >>> cat(path)
       
   425 This is a foo text.
       
   426 >>> is_temp
       
   427 False
       
   428 
       
   429 However, when downloading the file normally with the cache being used in
       
   430 fall-back mode, the file will be downloaded from the net and the cached copy
       
   431 will be replaced with the new content:
       
   432 
       
   433 >>> cat(download(server_url+'foo.txt')[0])
       
   434 The wrong text.
       
   435 >>> cat(cache, 'foo.txt')
       
   436 The wrong text.
       
   437 
       
   438 When trying to download a resource whose checksum does not match, the cached
       
   439 copy will neither be used nor overwritten:
       
   440 
       
   441 >>> write(server_data, 'foo.txt', 'This is a foo text.')
       
   442 >>> download(server_url+'foo.txt', md5('The wrong text.').hexdigest())
       
   443 Traceback (most recent call last):
       
   444 ChecksumError: MD5 checksum mismatch downloading 'http://localhost/foo.txt'
       
   445 >>> cat(cache, 'foo.txt')
       
   446 The wrong text.
       
   447 
       
   448 
       
   449 Configuring the download utility from buildout options
       
   450 ------------------------------------------------------
       
   451 
       
   452 The configuration options explained so far derive from the build logic
       
   453 implemented by the calling code. Other options configure the download utility
       
   454 for use in a particular project or buildout run; they are read from the
       
   455 ``buildout`` configuration section. The latter can be passed directly as the
       
   456 first argument to the download utility's constructor.
       
   457 
       
   458 The location of the download cache is specified by the ``download-cache``
       
   459 option:
       
   460 
       
   461 >>> download = Download({'download-cache': cache}, namespace='cmmi')
       
   462 >>> print download.cache_dir
       
   463 /download-cache/cmmi
       
   464 
       
   465 If the ``download-cache`` option specifies a relative path, it is understood
       
   466 relative to the current working directory, or to the buildout directory if
       
   467 that is given:
       
   468 
       
   469 >>> download = Download({'download-cache': 'relative-cache'})
       
   470 >>> print download.cache_dir
       
   471 /sample-buildout/relative-cache/
       
   472 
       
   473 >>> download = Download({'directory': join(sample_buildout, 'root'),
       
   474 ...                      'download-cache': 'relative-cache'})
       
   475 >>> print download.cache_dir
       
   476 /sample-buildout/root/relative-cache/
       
   477 
       
   478 Keyword parameters take precedence over the corresponding options:
       
   479 
       
   480 >>> download = Download({'download-cache': cache}, cache=None)
       
   481 >>> print download.cache_dir
       
   482 None
       
   483 
       
   484 Whether to assume offline mode can be inferred from either the ``offline`` or
       
   485 the ``install-from-cache`` option. As usual with zc.buildout, these options
       
   486 must assume one of the values 'true' or 'false':
       
   487 
       
   488 >>> download = Download({'offline': 'true'})
       
   489 >>> download.offline
       
   490 True
       
   491 
       
   492 >>> download = Download({'offline': 'false'})
       
   493 >>> download.offline
       
   494 False
       
   495 
       
   496 >>> download = Download({'install-from-cache': 'true'})
       
   497 >>> download.offline
       
   498 True
       
   499 
       
   500 >>> download = Download({'install-from-cache': 'false'})
       
   501 >>> download.offline
       
   502 False
       
   503 
       
   504 These two options are combined using logical 'or':
       
   505 
       
   506 >>> download = Download({'offline': 'true', 'install-from-cache': 'false'})
       
   507 >>> download.offline
       
   508 True
       
   509 
       
   510 >>> download = Download({'offline': 'false', 'install-from-cache': 'true'})
       
   511 >>> download.offline
       
   512 True
       
   513 
       
   514 The ``offline`` keyword parameter takes precedence over both the ``offline``
       
   515 and ``install-from-cache`` options:
       
   516 
       
   517 >>> download = Download({'offline': 'true'}, offline=False)
       
   518 >>> download.offline
       
   519 False
       
   520 
       
   521 >>> download = Download({'install-from-cache': 'false'}, offline=True)
       
   522 >>> download.offline
       
   523 True
       
   524 
       
   525 
       
   526 Regressions
       
   527 -----------
       
   528 
       
   529 MD5 checksum calculation needs to be reliable on all supported systems, which
       
   530 requires text files to be treated as binary to avoid implicit line-ending
       
   531 conversions:
       
   532 
       
   533 >>> text = 'First line of text.\r\nSecond line.\r\n'
       
   534 >>> f = open(join(server_data, 'foo.txt'), 'wb')
       
   535 >>> f.write(text)
       
   536 >>> f.close()
       
   537 >>> path, is_temp = Download()(server_url+'foo.txt', md5(text).hexdigest())
       
   538 >>> remove(path)
       
   539 
       
   540 
       
   541 Clean up
       
   542 --------
       
   543 
       
   544 We should have cleaned up all temporary files created by downloading things:
       
   545 
       
   546 >>> ls(tempfile.tempdir)
       
   547 
       
   548 Reset the global temporary directory:
       
   549 
       
   550 >>> tempfile.tempdir = old_tempdir