# Copyright (c) 2010 ActiveState Software Inc. All rights reserved.

"""
    pypm.common.repository
    ~~~~~~~~~~~~~~~~~~~~~~

    Code related to repository functions
"""

import os
from os import path as P
import logging
import gzip
from hashlib import md5
from contextlib import closing
from collections import namedtuple
import tempfile
import shutil
import re
import json
from fnmatch import fnmatch

import pkg_resources
import six.moves
from applib import sh
from applib import log
from applib import textui
from applib import _cmdln as cmdln
from applib import _simpledb 
# from applib.misc import require_option
from applib.misc import xjoin

from pypm.common import net
from pypm.common import python
from pypm.common import supported
from pypm.common.util import BareDateTime
from pypm.common.util import ConfigParserNamedLists
from pypm.common.util import dlocked
from pypm.common.util import url_join
from pypm.common.util import path_to_url
from pypm.common.net import URLProperties
from pypm.common.package import BinaryPackage
from pypm.common.package import RepoPackage
from pypm.common.package import PackageFile

LOG = logging.getLogger(__name__)


class MultiRepositoryConfig(object):
    """Represent a config file with multiple repositories

    see src/pypm/client/client.conf[repository] for an example; also see the
    class docstring for pypm.common.util.ConfigParserNamedLists
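
    A minimal usage sketch, assuming a hypothetical config file with::

        [repository]
        default = free be
        free = http://pypm.example.com/free
        be = http://pypm.example.com/be

    then::

        mrc = MultiRepositoryConfig('client.conf')
        mrc.get_urls('default')  # -> the 'free' and 'be' URLs above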
    """

    RepositoryLocation = namedtuple('RepositoryLocation', 'name location')

    def __init__(self, *configfiles):
        self.cfg = six.moves.configparser.SafeConfigParser()
        self.cfg.read(configfiles)

        self.namedlists = ConfigParserNamedLists(
            self.cfg.items('repository'),
            self.RepositoryLocation,
            self._is_location)

    def get_locations(self, repository_locations):
        assert isinstance(repository_locations, str)
        locations = []

        for l in ConfigParserNamedLists.VALUE_SEP.split(repository_locations):
            if self._is_location(l):
                locations.append(self.RepositoryLocation('<unnamed>', l))
            elif l not in self.namedlists.mapping:
                raise ValueError(
                    'repository name "{0}" is not found; available ones are: {1}'.format(
                        l, self.namedlists.mapping.keys()))
            else:
                locations.extend(self.namedlists.mapping[l])

        return locations

    def get_urls(self, repository_locations):
        return [path_to_url(l.location) if '://' not in l.location else l.location
                for l in self.get_locations(repository_locations)]

    def _is_location(self, l):
        """Return True if `l` is a path or url"""
        return '/' in l or '\\' in l

class MultiRepositorySet(object):
    """A group of mutually-exclusive repository sets (``RepositorySet``)

    Example::

      rex/free
      rex/be
      rex/testing
      rex/stable

    Each repository set contains packages that are not in the others. The
    pattern specifying the mapping for packages -> repository-set is defined in
    a config file (see etc/activestate.conf:[packages]mapping). The same config
    file also defines the names ('free', 'be', ..) for the repository sets.
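
    A hypothetical mapping (patterns are matched against the package
    filename; the first match wins)::

        [packages]
        mapping =
            pyimsl-* be
            *        free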
    """
    
    def __init__(self, path, configfile):
        """
        - path: base path to the multirepositoryset
        - configfile: config file defining repository names and mapping
        """
        self.path = path
        self.mrc = MultiRepositoryConfig(configfile)
        self.reposets = {} # name -> RepositorySet
        self.mapping = []  # [(pattern, reponame), ...]
        self._init()

    def _init(self):
        for name, repository_locations in self.mrc.namedlists.mapping.items():
            assert len(repository_locations) == 1
            repository_location = repository_locations[0]
            self.reposets[name] = RepositorySet(
                P.join(self.path, name), name, repository_location.location)

        for line in self.mrc.cfg.get('packages', 'mapping').split('\n'):
            line = line.strip()
            if line and not line.startswith('#'):
                pattern, repo = line.split()
                self.mapping.append((pattern.strip(), repo.strip()))
            
    def get_repository(self, bpkg):
        """Return the repository where ``bpkg`` is mapped to.
        
        Pick the appropriate repository respected by the mapping
        (``self.mapping``). Name of the bpkg is used in matching the patterns
        in self.mapping. pyver and osarch of bpkg is finally used in picking up
        the underlying repository in the choosen reposet.
        
        - bpkg: An instance of ``pypm.common.package.BinaryPackage``
        """
        bpkg_fn = bpkg.make_filename()
        for pattern, name in self.mapping:
            if fnmatch(bpkg_fn, pattern):
                return self.reposets[name].get_repository(
                    bpkg.pyver, bpkg.osarch
                )
    
    def __iter__(self):
        """Iter over available repositories"""
        for name in sorted(self.reposets.keys()):
            for repo in self.reposets[name]:
                yield repo

    def __str__(self):
        return "<MultiRepositorySet: reposets=\n  %s\n/>" % '\n  '.join(
            [str(x) for x in self.reposets.items()])
    
#
# RepositorySet
#

class RepositorySet(object):
    """A set of repositories

    This set includes repositories for each platform/pyver combination. An
    example would be the 'free' repository set::

        $ tree -L 2  free/
        free/
        |-- 2.6
        |   |-- linux-x86
        |   |-- linux-x86_64
        |   |-- macosx
        |   |-- win32-x86
        |   `-- win64-x64
        |-- 2.7
        |   |-- linux-x86
        |   |-- linux-x86_64
        |   |-- macosx
        |   |-- win32-x86
        |   `-- win64-x64
        `-- 3.1
            |-- linux-x86
            |-- linux-x86_64
            |-- macosx
            |-- win32-x86
            `-- win64-x64
    """

    def __init__(self, path, name, url):
        self.path = path
        self.name = name
        self.url = url

    def create_repositories(self):
        """Create repositories for supported configuration"""
        for osarch in supported.os_architectures:
            for pyver in supported.py_versions:
                sh.mkdirs(P.join(self.path, pyver, osarch))

    def get_repository(self, pyver, osarch, autocreate=False):
        path = xjoin(self.path, pyver, osarch)
        url = '/'.join([self.url, pyver, osarch])
        if autocreate:
            # create it if it does not already exist
            sh.mkdirs(path)
            
        return Repository(path, self.name, pyver, osarch, url)

    def __iter__(self):
        """Iter over all supported repositories
        
        If a supported repository does not exists, simply create the repository
        directory before returning it.
        """
        for pyver in supported.py_versions:
            for osarch in supported.os_architectures:
                yield self.get_repository(pyver, osarch, autocreate=True)

    def __str__(self):
        return '{0}<{1.path}, {1.url}>'.format(self.__class__.__name__, self)
    __repr__ = __str__


#
# Repository
#

class Repository(object):
    """Repository directory containing packages and index"""

    def __init__(self, path, name, pyver, osarch, url):
        from pypm.web.uptree import UpTree  # XXX: put uptree out of 'web'
        self.path = path
        self.name = name
        self.pyver = pyver
        self.osarch = osarch
        self.url = url
        self.uptree = UpTree(
            self.path,
            content_cache_filenames=['info.json', 'imports'],
            mtime_cache_filenames=['log'],
        )
        
    def _update_uptree(self):
        counters = self.uptree.update()  # potentially long-operation
        if any(counters.values()):
            LOG.info('Uptree was updated: %s', counters)
        return counters

    def find_packages(self):
        """Return available packages in the repository"""
        self._update_uptree()
        return [p for p in self.uptree.get_files() if p.endswith('.pypm')]

    def find_all_packages(self):
        """Return all packages whether succeeded or failed

        Return a list of tuples of the form:

          (pkgfile, pkgdir, succeeded)

        where:

          pkgfile   - path to the package file (may not exist)
          pkgdir    - path to the package dir (.d/ directory)
          succeeded - True if the pkgfile exists
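
        A hypothetical consumer::

            for pkgfile, pkgdir, ok in repo.find_all_packages():
                if not ok:
                    LOG.warning('missing package file for %s', pkgdir)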
        """
        self._update_uptree()
        processed = set()
        for fil in textui.ProgressBar.iterate(self.uptree.get_files(), note='Files'):
            if fil.endswith('.pypm'):
                pkgfile, pkgdir, succeeded = fil, fil + '.d', True
            elif fil.endswith('.pypm.d'):
                pkgfile, pkgdir = fil[:-2], fil
                succeeded = self.uptree.exists(pkgfile)
            else:
                continue
            if pkgfile not in processed:
                processed.add(pkgfile)
                yield pkgfile, pkgdir, succeeded

    def __str__(self):
        return '{0}<{1.path}>'.format(self.__class__.__name__, self)


# 
# RepositoryIndex
# 

class RepoPackageDatabase(_simpledb.SimpleDatabase):
    """A database containing instances of ``pypm.common.package.RepoPackage``"""
            
_simpledb.setup(RepoPackageDatabase, RepoPackage,
                primary_keys=['name', 'version',
                              'pyver', 'osarch',
                              'pkg_version'])
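
# A hypothetical query against an index database - assuming, as the
# ``db.transaction()`` usage in ``generate_index`` below suggests, that
# _simpledb hands out SQLAlchemy sessions:
#
#   db = RepoPackageDatabase('/path/to/index')
#   with db.transaction() as session:
#       pkgs = session.query(RepoPackage).filter_by(name='distribute').all()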

class RepositoryIndex(object):
    """Index of packages in a repository

    Repositories can optionally have index files .. which are especially useful
    when the repository is only available remotely over the wire.

    The index file contains a list of all packages along with their metadata and
    the relative location of the package file.

    There is just one index file:

      - index | index.gz - list of packages that are available (sqlite)
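
    A minimal sketch of regenerating and then reading the index (path and
    URL are hypothetical)::

        repo = Repository('/srv/repo/free/2.7/linux-x86', 'free', '2.7',
                          'linux-x86', 'http://pypm.example.com/free/2.7/linux-x86')
        RepositoryIndex(repo).generate_index()
        db = RepositoryIndex(repo).get_index()  # read-only temporary copy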
    """

    def __init__(self, repository):
        assert isinstance(repository, Repository)
        self.repository = repository

    def get_index(self):
        """Return an existing index as ``RepoPackageDatabase``

        Returned index database corresponds to a temporary file (as the index
        file is originally compressed; it needs to be extracted to a temporary
        location) .. hence any attempts to "write" on the returned index
        database will be futile.
        """
        return RepoPackageDatabase(_ungzip(xjoin(self.repository.path, 'index.gz')))

    def generate_index(self):
        """Generated the repository index file (`index.gz`)

        index.gz is the compressed sqlite index containing all of the succeeded
        packages in the repository pool.

        Return the number of packages added to the repository index.
        """
        from pypm.grail.package import PackageShare
        assert P.exists(self.repository.path)
        idx_path = xjoin(self.repository.path, 'index')
        idx_gz_path = idx_path + '.gz'
        
        sh.rm(idx_path)
        db = RepoPackageDatabase(idx_path, touch=True)
        
        # Tag BE packages so the client can determine whether a package is
        # available only to BE customers.
        # See also: RepoPackage.requires_be_license property
        pkgtags = 'be' if self.repository.name == 'be' else ''

        # Load package-specific data from share/p/*
        pkgdata = dict([(s.name, s) for s in PackageShare.all()])

        with closing(db):
            LOG.debug('finding packages in %s', self.repository.path)
            packages = self.repository.find_packages()

            LOG.debug('processing %d packages', len(packages))
            rpkg_list = [
                RepoPackage.create_from(
                    BinaryPackage(**self._read_info_json(pkgfile)),
                    relpath=P.relpath(pkgfile, self.repository.path),
                    tags=pkgtags)
                for pkgfile in textui.ProgressBar.iterate(packages, note="Package")
            ]
            
            for rpkg in rpkg_list:
                # Optimize index size by removing the "description" field.
                # PyPI's descriptions are typically very long - see
                # http://pypi.python.org/pypi/zc.buildout for example - hence we
                # must remove them from the index.
                rpkg.description = ''
                if rpkg.name in pkgdata:
                    # Add package notes to the description^Wextra field
                    # See pypm.common.package.RepoPackage.FIELDS to understand
                    # why we are abusing this field.
                    notes = list(pkgdata[rpkg.name].get_notes_for(
                        pyver=rpkg.pyver, osarch=rpkg.osarch))

                    rpkg.description = json.dumps({
                        'notes': notes
                    })
                    LOG.debug('Patching "description" field for %s', rpkg)

            # keep only the latest pkg_version in index
            LOG.debug("pruning older pkg_version's")
            rpkg_list = _prune_older_binary_releases(rpkg_list)
            LOG.debug('.. resulting in %d packages', len(rpkg_list))

            LOG.info('  writing index (please wait) ...')
            with db.transaction() as session:
                session.add_all(rpkg_list)
                session.commit()
                session.close()

        LOG.info('  compressing index: ...%s%s',
                 os.path.basename(idx_gz_path),
                 (' (%d)' % len(rpkg_list)) if rpkg_list else '')
        sh.rm(idx_gz_path)
        with closing(gzip.open(idx_gz_path, 'wb')) as f:
            with open(idx_path, 'rb') as f2:
                shutil.copyfileobj(f2, f)
        sh.rm(idx_path)

        return len(rpkg_list)

    def _read_info_json(self, pypm_file):
        """Read cached info.json (as dict) from the .d/ directory
        
        If cached version is missing, read from the package file itself, which
        would be an expensive operation.
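
        A hypothetical (abridged) info.json, with at least the fields that
        form the index's primary key::

            {"name": "distribute", "version": "0.6.14", "pkg_version": 1,
             "pyver": "2.7", "osarch": "linux-x86"}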
        """
        info_json_loc = xjoin(pypm_file + '.d', 'info.json')

        try:
            s = self.repository.uptree.open_and_read(info_json_loc)
        except IOError:
            # There seems to be no .d/info.json file; perhaps this is a
            # 'custom' package that is not managed by pypm-builder. So let's
            # extract info.json from the package file (.pypm) even though that
            # is expensive (hence we also warn the user).
            LOG.info(
                'ALERT: Cache file (.d/info.json) missing; extracting from %s', pypm_file)
            s = PackageFile(pypm_file).retrieve_info_json()
            
        d = json.loads(s)

        # It is not clear whether info.json's "name" field is canonical (i.e.,
        # the lower-cased, safe version of the name, guaranteed to be the
        # same). Therefore, we do one final conversion here.
        d['name'] = pkg_resources.safe_name(d['name']).lower()
        return d


def _prune_older_binary_releases(packages):
    """Prune all older releases (pkg_version) of the package"""
    releases = {}

    for pkg in packages:
        key = (pkg.full_name, pkg.version, pkg.pyver, pkg.osarch)
        if key in releases:
            prevrel = releases[key]
            if pkg.pkg_version == prevrel.pkg_version:
                raise IOError('duplicate packages in repository: %s; %s' %
                              (prevrel.relpath, pkg.relpath))
            elif pkg.pkg_version > prevrel.pkg_version:
                releases[key] = pkg
        else:
            releases[key] = pkg

    return releases.values()
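
# A minimal sketch of the pruning behaviour above (stand-in namedtuple and
# made-up paths; the real entries are RepoPackage instances):
#
#   Pkg = namedtuple('Pkg', 'full_name version pyver osarch pkg_version relpath')
#   old = Pkg('distribute', '0.6.14', '2.7', 'linux-x86', 1, 'pool/d/di/a.pypm')
#   new = old._replace(pkg_version=2)
#   assert list(_prune_older_binary_releases([old, new])) == [new]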
        

#
# classes for managing remote repositories
#

class RemoteRepositorySet(object):
    """Represent a remotely available RepositorySet"""

    def __init__(self, url):
        self.url = url

    def get_repository(self, pyenv, osarch):
        assert isinstance(pyenv, python.BasePythonEnvironment)
        # Use full ActivePython version instead of pyver for ActiveState
        # repositories, and let the AS server handle the redirection. We do this
        # to control the repository URL for each and every ActivePython
        # release, e.g., use /2.6.6.16/... instead of /2.6/... even though
        # the actual repository path on our server uses 2.6.
        if re.search(r'pypm.*\.activestate\.com', self.url):
            ver = pyenv.apyver
        else:
            ver = pyenv.pyver
        return RemoteRepository(url_join(self.url, [ver, osarch]))

class RemoteRepository(object):
    """Represent a remotely available Repository"""

    # Filename of the actual remote index file
    REMOTE_INDEX_FILENAME = "index.gz"

    def __init__(self, url):
        self.url = url

    def download_index(self, target_file, force, verbose=True, interactive=True):
        """Download repository index unless it was downloaded recently (Etag)

        - force: Do not use cache; always download
        - verbose: If False, try not to print (LOG.info) anything to console
          unless an actual download happens.

        Return True if download actually happened.
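
        A hypothetical call::

            rr = RemoteRepository('http://pypm.example.com/free/2.7/linux-x86')
            if rr.download_index('/tmp/pypm/index', force=False):
                db = RepoPackageDatabase('/tmp/pypm/index')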
        """
        def start_info(status):
            if status == 'Hit' and not verbose:
                return None
            return '%s: [%s] :repository-index:' % (
                status,
                six.moves.urllib.parse.urlparse(self.url).netloc)
        
        index_url = url_join(self.url, [self.REMOTE_INDEX_FILENAME])
        try:
            idxgz_file, downloaded = net.download_file(index_url, P.dirname(target_file), {
                'use_cache': not force,
                'save_properties': True,
                'start_info': start_info,
                }, interactive=interactive)
            
            if not downloaded:
                # index was not updated
                return False
        except six.moves.urllib.error.HTTPError as e:
            if e.code == 404: # Not Found
                raise ValueError(
                    '{0.url} does not appear to be a valid repository '
                    'because {1} is missing.'.format(self, index_url))
            raise
        
        # extract index.gz (REMOTE_INDEX_FILENAME) to index (target_file)
        try:
            with closing(gzip.open(idxgz_file, 'rb')) as f:
                with open(target_file, 'wb') as f2:
                    f2.write(f.read())
        except:
            # If an error occurs during extraction, simply delete the index file
            # (so that it will get properly synced during next sync)
            sh.rm(target_file)
            sh.rm(idxgz_file)
            raise
        
        return True

    def get_unique_id(self):
        """Return an alpha-numeric ID unique to this repository (URL)"""
        return md5(self.url.encode('utf8')).hexdigest()

    def __str__(self):
        return '{0.__class__.__name__}<{0.url}>'.format(self)

class RemoteRepositoryManager(object):
    """Manage multiple remote repositories with a local cache"""

    def __init__(self, path):
        # local cache directory where repository indexes will be stored
        self.path = path
        sh.mkdirs(path)

    def get_index_db(self, remote_repository):
        """Return the index database for remote repository

        If necessary, download the index automatically
        """
        return RepoPackageDatabase(
            self.get_local_index_path(
                remote_repository))

    def sync_repository(self, remote_repository, force, verbose=True, interactive=True):
        """Sync the cache for a remote repository"""
        with dlocked(self.path):
            assert isinstance(remote_repository, RemoteRepository)
            idx_path = self.get_local_index_path(remote_repository)
            sh.mkdirs(P.dirname(idx_path))
            return remote_repository.download_index(
                idx_path, force, verbose, interactive=interactive)

    def get_remote_index_last_download_attempt_time(self, remote_repository):
        """Return the UTC datetime when the index file was last *attempted* to
        download

        The download may not have happened, however, due to unmodified ETag.

        If no index is available (as in, 'pypm sync' or an equivalent was never
        run in the first place), return None
        """
        original_index_file = P.join(
            P.dirname(self.get_local_index_path(remote_repository)),
            remote_repository.REMOTE_INDEX_FILENAME)
        urlprops = URLProperties(original_index_file).load()
        if urlprops:
            return BareDateTime.to_datetime(
                urlprops.custom['last_attempt_utc'])

    def get_local_index_path(self, remote_repository):
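        # e.g. <self.path>/d41d8cd98f00b204e9800998ecf8427e/index
        # (the hash, made up here, is the md5 of the repository URL)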
        return xjoin(
            self.path,
            remote_repository.get_unique_id(),
            'index')


#
# cmdln
# 

@cmdln.option('-r', '--repository-path',
              help='Local repository path')
@cmdln.option('-R', '--multi-repository-set-path',
              help='MultiRepositorySet path')
class Commands(log.LogawareCmdln):
    name = "pypm-repository"

    def initialize(self):
        # require_option(self.options, 'configfile')
        # require_option(self.options, 'multi_repository_set_path')
        if self.options.multi_repository_set_path and self.options.repository_path:
            raise ValueError('must pass either -r or -R, not both')
        elif not (self.options.multi_repository_set_path or self.options.repository_path):
            raise ValueError('must pass one of -r or -R')
            
    @cmdln.option('', '--filter',
                  help='Filter repo paths by a glob filter; eg: free; free/2.?; macosx')
    @cmdln.option('', '--only-new', action="store_true", 
                  help="Only writes for packages with no 'files' or 'imports'") 
    def _do_write_files_list(self, subcmd, opts):
        """${cmd_name}: Generate .d/files and .d/imports files for all packages

        ${cmd_usage}
        ${cmd_option_list}
        """
        with self.bootstrapped():
            mreposet = MultiRepositorySet(
                self.options.multi_repository_set_path,
                self.options.configfile
            )
            
            skipped = 0
            LOG.info('Generating .d/<files,imports> for repositories in: %s', mreposet.path)
            with textui.longrun(LOG):
                for repo in mreposet:
                    if opts.filter and not fnmatch(repo.path, '*'+opts.filter+'*'):
                        skipped += 1
                        continue
                    LOG.info('')
                    LOG.info('->> {0.name:6}{0.pyver:6}{0.osarch:15}'.format(repo))
                    LOG.info('  %s', repo.path)
                    from pypm.builder.mason import _extract_files_list
                    for pypm_file in textui.ProgressBar.iterate(repo.find_packages()):
                        if opts.only_new:
                            if all([P.exists(pypm_file + '.d/imports'),
                                    P.exists(pypm_file + '.d/files')]):
                                continue
                        modules = _extract_files_list(pypm_file)
                        LOG.info('P: %s -> %s', os.path.basename(pypm_file), modules)
            if skipped:
                LOG.warning('skipped %d repositories', skipped)
                
    def do_dirty(self, subcmd, opts, *paths):
        """${cmd_name}: Mark the given paths are dirty (for uptree)

        ${cmd_usage}
        ${cmd_option_list}
        """
        with self.bootstrapped():
            mreposet = MultiRepositorySet(
                self.options.multi_repository_set_path,
                self.options.configfile
            )
            for path in paths:
                for repo in mreposet:
                    if path.startswith(repo.path + '/'):
                        cnt = repo.uptree.mark_dirty(path)
                        LOG.info('%d dirty marks set for: %s', cnt, path)
                        break
                else:
                    LOG.error('Not a path in any repository: %s', path)
                    

    @cmdln.option('-n', '--dry-run', action="store_true", default=False,
                  help="Perform a dry-run (grep for potential overwrites)")
    @cmdln.option('-f', '--force', action="store_true", default=False,
                  help="Allow overwrites (disabled by default)")
    def do_copy_custom(self, subcmd, opts, *paths):
        """${cmd_name}: Copy packages into the appropriate repository

        Use this command to *manually* copy the *custom* PyPM packages that
        won't be built by the *automated* pypm-builder/grail. This includes the
        following cases,

        1. "extra" PyPM packages available in the "GoldBits" directory of
           ActivePython (eg: as.openssl)

        2. Proprietary packages (eg: pyimsl from VNI)

        Use ``MultiRepositoryConfig`` (etc/activestate.conf) to configure how
        the custom packages are allocated, i.e., whether they go into the
        "free" or the "be" repo.

        Example::

            $ bin/pypm-repository -c etc/activestate.conf copy_custom \\
                $NAS/ActivePython/2.7.0.2/GoldBits/internal/extra/*.pypm

        ${cmd_usage}
        ${cmd_option_list}
        """
        with self.bootstrapped():
            mreposet = MultiRepositorySet(
                self.options.multi_repository_set_path,
                self.options.configfile
            )
            for path in paths:
                bpkg = PackageFile(path).to_binary_package()
                repo = mreposet.get_repository(bpkg)
                target_path = P.join(
                    repo.path, 'pool', bpkg.name[0], bpkg.name[:2],
                    os.path.basename(path))
                sh.mkdirs(P.dirname(target_path))
                action = 'OVERWRITE' if P.exists(target_path) else 'CREATE'
                LOG.info('%s %s %s', action, repo.name, target_path)
                if not opts.dry_run:
                    if P.exists(target_path) and not opts.force:
                        raise IOError('cannot overwrite: %s' % target_path)
                    sh.cp(path, target_path)
                    repo.uptree.mark_dirty(target_path)
            
    @cmdln.option('', '--filters', default='free,be',
                  help='Comma-separated glob filters for repo paths; eg: free,be; free/2.?; macosx')
    def do_genidx(self, subcmd, opts):
        """${cmd_name}: Generate the index file for all available repositories

        ${cmd_usage}
        ${cmd_option_list}
        """
        with self.bootstrapped():
            if self.options.multi_repository_set_path:
                self.genidx_mreposet(opts.filters.split(','))
            else:
                self.genidx_repository()

    def genidx_mreposet(self, filters):
        logsdir = xjoin(self.options.multi_repository_set_path, '_logs')
        with log.archivedby(logging.getLogger('pypm'),
                            logsdir,
                            'repository_genidx',
                            level=logging.INFO,
                            formatter=logging.Formatter('%(asctime)s %(message)s')):
            mreposet = MultiRepositorySet(
                self.options.multi_repository_set_path,
                self.options.configfile
            )
            
            skipped = 0
            LOG.info('Generating indices for repositories in: %s', mreposet.path)
            with textui.longrun(LOG):
                for repo in mreposet:
                    if filters and not any([fnmatch(repo.path, '*'+f+'*') for f in filters]):
                        skipped += 1
                        continue
                    LOG.info('')
                    LOG.info('-> {0.name:6}{0.pyver:6}{0.osarch:15}'.format(repo))
                    LOG.info('  %s', repo.path)
                    idx = RepositoryIndex(repo)
                    idx.generate_index()
            if skipped:
                LOG.info('ALERT: skipped %d repositories', skipped)

    def genidx_repository(self):
        repo = Repository(
            self.options.repository_path, '<unnamed>', 
            pyver='<multiple>', osarch='<multiple>', url='<nourl>')
        idx = RepositoryIndex(repo)
        idx.generate_index()

def main():
    Commands(install_console=True, default_verbosity=1).main()

#
# -- internal --
#

def _ungzip(path):
    """Ungzip a compressed to a temporary location

    Return the extracted location
    """
    with closing(gzip.open(path, 'rb')) as f:
        with tempfile.NamedTemporaryFile(delete=False) as f2:
            shutil.copyfileobj(f, f2)
            return f2.name
