#!/usr/bin/python2 -s

"""
This script crawls the local copy of the master mirrors (which in our case
is just a nfs mount of the master mirror content). According to what it
finds, it updates the mirrormanager2 database.
It will create new product/version if it finds them and drop directories if
they disappeared.

Basically, it checks the content of the nfs mount, if the directory
contains a lot of files, it takes the 3 most recents ones, stores them in
the DB and they will be used later on to check if the mirrors are up to date.
If the directory does not contain too many files, it will register them all
and thus will check them all on the mirrors.
The threshold is stored in: `short_filelist` and is currently at 10.

If the script finds a yum or atomic repository (finds a repo data or an atomic
summary file), it will create a repository object (cf `make_repository`) which
is basically a mapping between a yum repo name (ie: Fedora-20-updates) and a
directory (/pub/fedora/linux/updates/....)

"""

import glob
import logging
import re
import optparse
import os
import stat
import sys
import yum.repoMDObject
import datetime
import time
import hashlib


sys.path.insert(0, os.path.join(os.path.dirname(
    os.path.abspath(__file__)), '..'))
import mirrormanager2.lib
from mirrormanager2.lib.model import (
    Arch, Directory, Repository, Version, FileDetail)
from mirrormanager2.lib.repomap import repo_prefix
from mirrormanager2.lib.sync import run_rsync
import mirrormanager2.lib.umdl as umdl

logger = None
stdexcludes=['.*\.snapshot', '.*/\.~tmp~']
cname = "N/A"

class MasterFilter(logging.Filter):
    def filter(self, record):
        record.category = cname
        return True

def make_file_details_from_checksums(session, diskpath, relativeDName, D):
    def _parse_checksum_file(relativeDName, checksumlen):
        r = {}
        try:
            f = open(os.path.join(diskpath, relativeDName),  'r')
            for line in f:
                line = line.strip()
                s = line.split()
                if len(s) < 2:
                    continue
                if len(s[0]) != checksumlen:
                    continue
                # strip off extraneous starting '*' char from name
                s[1] = s[1].strip('*')
                r[s[1]] = s[0]
            f.close()
        except:
            pass
        return r

    def _checksums_from_globs(relativeDName, globs, checksumlen):
        d = {}
        checksum_files = []
        for g in globs:
            checksum_files.extend(
                glob.glob(os.path.join(diskpath, relativeDName, g)))
        for f in checksum_files:
            d.update(_parse_checksum_file(f, checksumlen))
        return d

    if diskpath is None:
        return

    sha1_globs = ['*.sha1sum', 'SHA1SUM', 'sha1sum.txt']
    md5_globs = ['*.md5sum', 'MD5SUM', 'md5sum.txt']
    sha256_globs = ['*-CHECKSUM', 'sha256sum.txt']
    sha512_globs = ['*.sha512sum', 'SHA512SUM', 'sha512sum.txt']
    md5dict = _checksums_from_globs(relativeDName, md5_globs, 32)
    sha1dict = _checksums_from_globs(relativeDName, sha1_globs, 40)
    sha256dict = _checksums_from_globs(relativeDName, sha256_globs, 64)
    sha512dict = _checksums_from_globs(relativeDName, sha512_globs, 128)

    files = set()
    for k in md5dict.keys():
        files.add(k)
    for k in sha1dict.keys():
        files.add(k)
    for k in sha256dict.keys():
        files.add(k)
    for k in sha512dict.keys():
        files.add(k)

    for f in files:
        try:
            s = os.stat(os.path.join(diskpath, relativeDName, f))
        except OSError:
            # bail if the file doesn't actually exist
            continue
        sha1 = sha1dict.get(f)
        md5  = md5dict.get(f)
        sha256  = sha256dict.get(f)
        sha512  = sha512dict.get(f)
        size = s.st_size
        ctime = s[stat.ST_CTIME]
        fd = mirrormanager2.lib.get_file_detail(
            session,
            directory_id=D.id,
            filename=f,
            sha1=sha1,
            md5=md5,
            sha256=sha256,
            sha512=sha512,
            size=size,
            timestamp=ctime)
        if not fd:
            fd = FileDetail(
                directory=D,
                filename=f,
                sha1=sha1,
                md5=md5,
                sha256=sha256,
                sha512=sha512,
                timestamp=ctime,
                size=size)
            session.add(fd)
            session.commit()


def make_repo_file_details(session, diskpath, relativeDName, D, category, target):

    warning = "Won't make repo file details"

    if diskpath is None:
        logger.warning("%s: diskpath is None" % warning)
        return

    # For yum repos and ostree repos
    allowed_targets = ['repomd.xml', 'summary']
    if target not in allowed_targets:
        logger.warning("%s: %r not in %r" % (warning, target, allowed_targets))
        return

    absolutepath = os.path.join(diskpath, relativeDName, target)

    if not os.path.exists(absolutepath):
        logger.warning("%s: %r does not exist" % (warning, absolutepath))
        return

    try:
        f = open(absolutepath, 'r')
        contents = f.read()
        f.close()
    except:
        return

    size = len(contents)
    md5 = hashlib.md5(contents).hexdigest()
    sha1 = hashlib.sha1(contents).hexdigest()
    sha256 = hashlib.sha256(contents).hexdigest()
    sha512 = hashlib.sha512(contents).hexdigest()

    if target == 'repomd.xml':
        yumrepo = yum.repoMDObject.RepoMD('repoid', absolutepath)
        if 'timestamp' not in yumrepo.__dict__:
            umdl.set_repomd_timestamp(yumrepo)
        timestamp = yumrepo.timestamp
    elif target == 'summary':
        # TODO -- ostree repos may have a timestamp in their summary file
        # someday.  for now, just use the system mtime.
        timestamp = os.path.getmtime(absolutepath)

    fd = mirrormanager2.lib.get_file_detail(
        session,
        directory_id=D.id,
        filename=target,
        sha1=sha1,
        md5=md5,
        sha256=sha256,
        sha512=sha512,
        size=size,
        timestamp=timestamp)
    if not fd:
        fd = FileDetail(
            directory_id=D.id,
            filename=target,
            sha1=sha1,
            md5=md5,
            sha256=sha256,
            sha512=sha512,
            timestamp=timestamp,
            size=size)
        logger.info("Updating FileDetail %r, %r" % (fd, absolutepath))
        session.add(fd)
        session.commit()


def move_repository_from_development(prefix, arch, newdir):
    try:
        repo = Repository.selectBy(prefix=prefix, arch=arch)[0]
    except:
        return None

    if repo.directory is not None and newdir.name is not None:
        if '/development' in repo.directory.name \
                and '/releases' in newdir.name:
            repo.directory = newdir
    return repo


def make_repository(session, directory, relativeDName, category, target):

    warning = "Won't make repository object"

    # For yum repos and ostree repos
    allowed_targets = ['repomd.xml', 'summary']
    if target not in allowed_targets:
        logger.warning("%s: %r not in %r" % (warning, target, allowed_targets))
        return

    if target == 'repomd.xml':
        (ver, arch) = umdl.guess_ver_arch_from_path(session, category, relativeDName)
        if ver is None or arch is None:
            logger.warning("%s: could not guess version and arch %r, %r" % (
                warning, ver, arch))
            return None
    elif target == 'summary':
        # For ostree, we someday need to actually extract the arch information
        # from the ostree repo, but for now (F21 and F22) we will only be
        # shipping x86_64, so we hardcode that.  At present, it is not possible
        # to query an ostree repo for the arch information.  Bug walters about
        # this.
        arch = mirrormanager2.lib.get_arch_by_name(session, 'x86_64')
        # Furthermore, we'll grab the version piece from the path which looks
        # like atomic/rawhide or atomic/21.
        ver = relativeDName.rstrip('/').split('/')[-1]
        ver = mirrormanager2.lib.get_version_by_name_version(
            session, category.product.name, ver)
        if ver is None:
            if not relativeDName.endswith('/'):
                relativeDName += '/'
            ver = umdl.create_version_from_path(category, relativeDName)
            session.add(ver)
            session.commit()
            umdl.version_cache.append(ver)

    # stop making duplicate Repository objects.
    if len(directory.repositories) > 0:
        logger.warning("%s: directory already has a repository" %
            (directory.name))
        return None

    repo = None
    prefix = repo_prefix(relativeDName, category, ver)
    repo = mirrormanager2.lib.get_repo_prefix_arch(
        session, prefix, arch.name)

    if not repo:
        # historically, Repository.name was a longer string with
        # product and category deliniations.  But we were getting
        # unique constraint conflicts once we started introducing
        # repositories under repositories.  And .name isn't used for
        # anything meaningful.  So simply have it match dir.name,
        # which can't conflict.
        repo = Repository(
            name=directory.name,
            category=category,
            version=ver,
            arch=arch,
            directory=directory,
            prefix=prefix)
        logger.info(
            'Created Repository(prefix=%s, version=%s, arch=%s, '
            'category=%s) -> Directory %s'
            % (prefix, ver.name, arch.name, category.name, directory.name))
        session.add(repo)
        session.flush()
    else:
        if repo.prefix != prefix:
            repo.prefix = prefix

    return repo


def is_excluded(path, excludes):
    for e in excludes:
        if re.compile(e).match(path):
            return True
    return False


def nuke_gone_directories(session, diskpath, category):
    """ deleting a Directory has a ripple effect through the whole
        database.  Be really sure you're ready do to this.  It comes
        in handy when say a Test release is dropped."""

    topdirName = category.topdir.name

    directories = category.directories # in ascending name order
    directories.reverse() # now in decending name order, bottoms up
    for d in directories:
        if is_excluded(d.name, category.excludes): continue
        relativeDName = umdl.remove_category_topdir(topdirName, d.name)
        if not os.path.isdir(os.path.join(diskpath, relativeDName)):
            if len(d.categories) == 1: # safety, this should always trigger
                logger.info("Deleting gone directory %s" % (d.name))
                session.delete(d)
                session.commit()


def ctime_from_rsync(date, hms):
    year, month, day = date.split('/')
    hour, minute, second = hms.split(':')
    t = datetime.datetime(
        int(year), int(month), int(day),
        int(hour), int(minute), int(second), 0, None)
    return int(time.mktime(t.timetuple()))


def fill_category_directories_from_rsync(
        line, category, topdirName, category_directories):
    readable = True
    relativeDName = line.split()[4]
    if re.compile('^\.$').match(d):
        directoryname = topdirName
    else:
        directoryname = os.path.join(topdirName, relativeDName)

    if is_excluded(directoryname, stdexcludes + category.excludes): return

    perms = line.split()[0]
    if not re.compile('^d......r.x').match(perms) \
            or umdl.parent_dir(relativeDName) in category.unreadable_dirs:
        readable=False
        category.unreadable_dirs.add(relativeDName)

    try:
        perm, size, date, hms, filepath = line.split()
    except ValueError:
        raise
    ctime = ctime_from_rsync(date, hms)

    category_directories[relativeDName] = {
        'files':{},
        'isRepository':False,
        'isAtomic':False,
        'readable':readable,
        'ctime':ctime,
        'changed':True}
    if d.endswith('repodata'):
        parent_relativeDName = umdl.parent_dir(relativeDName)
        try:
            category_directories[parent_relativeDName]['isRepository'] = True
        except KeyError:
            category_directories[parent_relativeDName] = {
                'files':{},
                'isRepository':True,
                'readable':readable,
                'ctime':ctime,
                'changed':True}

    return category_directories


def add_file_to_directory(line, category_directories):
    try:
        perm, size, date, hms, filepath = line.split()
    except ValueError:
        return
    try:
        dt = ctime_from_rsync(date, hms)
    except ValueError:
        return

    l = filepath.split('/')
    filename = l[-1]
    subpath = l[:-1]
    if len(subpath) > 0:
        relativeDName = os.path.join(*subpath)
    else:
        relativeDName = ''
    category_directories[relativeDName]['files'][filename] = {
        'size':size, 'stat':dt}


def short_filelist(files):
    html=0
    rpms=0
    hdrs=0
    drpms=0
    for f in files.keys():
        if f.endswith('.html'):  html += 1
        if f.endswith('.rpm'):   rpms += 1
        if f.endswith('.hdr'):   hdrs += 1
        if f.endswith('.drpm'): drpms += 1
    if html>10 or rpms > 10 or hdrs > 10 or drpms > 10:
        date_file_list = []
        rc = {}
        for k in files.keys():
            date_file_tuple = (files[k]['stat'], k, files[k]['size'])
            date_file_list.append(date_file_tuple)
        date_file_list.sort()
        # keep the most recent 3
        date_file_list = date_file_list[-3:]

        for _, k, _ in date_file_list:
            rc[k] = files[k]
        return rc
    else:
        return files


def sync_category_directories(
        session, config, diskpath, category, category_directories):

    logger.debug("  sync_directories_directories %r" % category)

    for relativeDName in sorted(category_directories.keys()):
        value = category_directories[relativeDName]
        set_readable = False
        set_ctime = False
        set_files = False

        if relativeDName in category.directory_cache:
            d = category.directory_cache[relativeDName]
            if d['readable'] != value['readable']:
                set_readable = True
            if d['ctime'] != value['ctime']:
                set_ctime = True
            D = mirrormanager2.lib.get_directory_by_id(session, d.id)
        else:
            if relativeDName == u'':
                D = category.topdir
            else:
                # Can't find the new directory, just add it
                dname = os.path.join(category.topdir.name, relativeDName)
                D = Directory(
                    name=dname,
                    readable=value['readable'],
                    ctime=value['ctime'])
                logger.debug(
                    u'Created Directory(%s, readable=%s, ctime=%s)'
                    % (dname, value['readable'], value['ctime']))
            # Add this category to the directory
            D.categories.append(category)
            session.add(D)
            # And flush so that we can already start using it
            session.flush()
            # Refresh the cache
            category.directory_cache = cache_directories(category)
            d = category.directory_cache[relativeDName]

        if value['changed']:
            set_files = True

        if (set_readable or set_ctime or set_files):
            if set_readable:
                D.readable = value['readable']
            if set_ctime:
                D.ctime = value['ctime']
            if set_files:
                if D.files != short_filelist(value['files']):
                    D.files = short_filelist(value['files'])
        session.add(D)
        session.commit()
        make_file_details_from_checksums(session, diskpath, relativeDName, D)

    # this has to be a second pass to be sure the child repodata/ dir is
    # created in the db first
    for relativeDName, value in category_directories.iteritems():
        d = category.directory_cache[relativeDName]
        D = mirrormanager2.lib.get_directory_by_id(session, d.id)

        if value['isRepository']:
            target = 'repomd.xml'
        elif value['isAtomic']:
            target = 'summary'
        else:
            target = None

        if value['isRepository'] or value['isAtomic']:
            make_repository(session, D, relativeDName, category, target)

        if 'repomd.xml' in value['files']:
            target = 'repomd.xml'
        elif 'summary' in value['files']:
            target = 'summary'
        else:
            continue

        make_repo_file_details(
            session, diskpath, relativeDName, D, category, target)

    Directory.age_file_details(session, config)


def parse_rsync_listing(session, config, category, f):
    topdirName = category.topdir.name
    category_directories = {}
    category.unreadable_dirs = set()
    while True:
        line = f.readline()
        if not line: break
        line.strip()
        l = line.split()
        if line.startswith('d') and len(l) == 5 and len(l[0]) == 10:
            # good guess it's a directory line
            if re.compile('^\.$').match(line):
                # we know category.topdir exists and isn't excluded
                pass
            else:
                category_directories = fill_category_directories_from_rsync(
                    line, category, topdirName, category_directories)
        else:
            add_file_to_directory(line, category_directories)

    sync_category_directories(
        session, config, None, category, category_directories)


def sync_directories_using_rsync(session, config, rsyncpath, category):
    try:
        result, output = run_rsync(rsyncpath, category.extra_rsync_options, logger)
    except:
        logger.warning('Failed to run rsync.', exc_info = True)
        return
    if result > 0:
        logger.info(
            "rsync returned exit code %d for Category %s: %s"
            % (result, category.name, output))
    # still, try to use the output listing if we can
    parse_rsync_listing(session, config, category, output)


def sync_directories_from_file(session, config, filename, category):
    f = open(filename, 'r')
    parse_rsync_listing(session, config, None, category, f)
    f.close()


def cache_directories(category):
    cache = dict()
    topdirName = category.topdir.name
    for directory in list(category.directories):
        relativeDName = umdl.remove_category_topdir(
            topdirName, directory.name).strip(u'/')
        cache[relativeDName] = directory
    return cache


def sync_directories_from_disk(
    session, config, diskpath, category, excludes=None):

    excludes = excludes or []
    logger.debug("sync_directories_from_disk %r" % diskpath)
    category.unreadable_dirs = set()
    # drop any trailing slashes from diskpath
    diskpath = diskpath.rstrip(u'/')
    category_directories = {}

    for dirpath, dirnames, filenames in os.walk(diskpath, topdown=True):
        relativeDName = dirpath[len(diskpath) + 1:]
        relativeDName = relativeDName.strip(u'/')
        logger.debug("  walking %r" % relativeDName)
        if is_excluded(relativeDName, stdexcludes + excludes):
            logger.info("excluding %s" % (relativeDName))
            # exclude all its subdirs too
            dirnames[:] = []
            continue


        # avoid disappearing directories
        try:
            s = os.stat(os.path.join(diskpath, relativeDName))
            ctime = s[stat.ST_CTIME]
        except OSError:
            logger.debug("Avoiding %r, dissappeared." % relativeDName)
            continue

        try:
            d_ctime = category.directory_cache[relativeDName]['ctime']
        except KeyError:
            # we'll need to create it
            d_ctime = 0

        dirnames.sort()

        mode = s.st_mode
        readable = not not (mode & stat.S_IRWXO & (stat.S_IROTH|stat.S_IXOTH))
        if not readable or umdl.parent_dir(relativeDName) in category.unreadable_dirs:
            category.unreadable_dirs.add(relativeDName)
        isRepo = u'repodata' in dirnames
        isAtomic = u'summary' in filenames and u'objects' in dirnames

        changed = (d_ctime != ctime)
        if changed:
            logger.info("%s has changed: %s != %s" % (
                relativeDName, d_ctime, ctime))
        else:
            logger.debug("    %s has not changed" % relativeDName)

        category_directories[relativeDName] = {
            'files':{},
            'isRepository':isRepo,
            'isAtomic':isAtomic,
            'readable':readable,
            'ctime':ctime,
            'changed':changed}

        # skip per-file stat()s if the directory hasn't changed
        if changed:
            for f in filenames:
                try:
                    s = os.stat(os.path.join(diskpath, relativeDName, f))
                except OSError:
                    continue
                category_directories[relativeDName]['files'][f] = {
                    'size':str(s.st_size),
                    'stat':s[stat.ST_CTIME]}

    sync_category_directories(
        session, config, diskpath, category, category_directories)

def setup_logging(config, options):
    global logger
    log_dir = config.get('MM_LOG_DIR', None)
    # check if the directory exists
    if log_dir is not None:
        if not os.path.isdir(log_dir):
            # MM_LOG_DIR seems to be configured but does not exist
            # Logging into cwd.
            logger.warning("Directory " + log_dir + " does not exists."
                           " Logging into CWD.")
            log_dir = None

    if log_dir is not None:
        log_file = log_dir + "/" + options.logfile
    else:
        log_file = options.logfile

    fmt = '%(asctime)s:%(category)s:%(message)s'
    formatter = logging.Formatter(fmt=fmt)
    logger = logging.getLogger('umdl')
    handler = logging.handlers.WatchedFileHandler(log_file, "a+b")
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    f = MasterFilter()
    logger.addFilter(f)
    # list_categories is a special case where the user wants to see something
    # on the console and not only in the log file
    if options.debug or options.list_categories:
        sh = logging.StreamHandler()
        sh.setFormatter(formatter)
        logger.addHandler(sh)
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)


def main():
    global logger
    global cname
    parser = optparse.OptionParser(usage=sys.argv[0] + " [options]")
    parser.add_option(
        "-c", "--config",
        dest="config", default='/etc/mirrormanager/mirrormanager2.cfg',
        help="Configuration file to use")
    parser.add_option(
        "--logfile",
        dest="logfile", default='umdl.log',
        metavar="FILE", help="write logs to FILE")
    parser.add_option(
        "--list",
        dest="list_categories", default=False, action="store_true",
        help="list existing categories and exit")
    parser.add_option(
        "--category", metavar="CATEGORY",
        dest="categories", default=None,
        help="only scan category CATEGORY")
    parser.add_option(
        "--debug",
        dest="debug", default=False, action="store_true",
        help="enable debugging")
    parser.add_option(
        "--delete-directories",
        dest="delete_directories",
        default=False,
        action="store_true",
        help="delete directories from the database that are no longer "
            "on disk")

    (options, args) = parser.parse_args()
    config = dict()
    with open(options.config) as config_file:
        exec(compile(config_file.read(), options.config, 'exec'), config)

    session = mirrormanager2.lib.create_session(config['DB_URL'])

    setup_logging(config, options)

    logger.info("Starting umdl")

    if options.list_categories:
        categories = mirrormanager2.lib.get_categories(session)
        for c in categories:
            logger.info(c)
        session.close()
        logger.info("Ending umdl")
        return 0

    umdl.setup_arch_version_cache(session)
    check_categories = []
    if options.categories is None:
        check_categories = config.get('umdl_master_directories')
    else:
        for i in config.get('umdl_master_directories'):
            if i['category'] == options.categories:
                check_categories.append(i)

    for i in check_categories:
        cname = i['category']

        category = mirrormanager2.lib.get_category_by_name(session, cname)
        if not category:
            logger.error(
                'umdl_master_directories Category %s does not exist in the '
                'database, skipping' % (cname))
            continue

        if category.product is None:
            logger.error(
                'umdl_master_directories Category %s has null Product, '
                'skipping' % (cname))
            continue

        category.excludes = i.get('excludes', [])
        category.extra_rsync_options = i.get('options')
        category.directory_cache = cache_directories(category)

        if i['type'] == 'rsync':
            sync_directories_using_rsync(session, config, i['url'], category)

        if i['type'] == 'file':
            sync_directories_from_file(session, config, i['url'], category)

        if i['type'] == 'directory':
            sync_directories_from_disk(session, config, i['path'], category)

            if options.delete_directories:
                nuke_gone_directories(session, i['path'], category)

    logger.info("Ending umdl")

    return 0


if __name__ == "__main__":
    sys.exit(main())
