Current File : //etc/pkg/sysrepo/sysrepo_p5p.py
#!/usr/bin/python2.7
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#

# Copyright (c) 2012, 2015, Oracle and/or its affiliates. All rights reserved.

import errno
import httplib
import os
import shutil
import sys
import threading
import traceback

import simplejson

import pkg.p5p

# redirecting stdout for proper WSGI portability
# (anything this module prints therefore ends up on stderr)
sys.stdout = sys.stderr

# Pre-formatted "<code> <reason phrase>" status lines handed to the WSGI
# start_response callable.
SERVER_OK_STATUS = "%s %s" % (httplib.OK, httplib.responses[httplib.OK])
SERVER_ERROR_STATUS = "%s %s" % (httplib.INTERNAL_SERVER_ERROR,
    httplib.responses[httplib.INTERNAL_SERVER_ERROR])
SERVER_NOTFOUND_STATUS = "%s %s" % (httplib.NOT_FOUND,
    httplib.responses[httplib.NOT_FOUND])
SERVER_BADREQUEST_STATUS = "%s %s" % (httplib.BAD_REQUEST,
    httplib.responses[httplib.BAD_REQUEST])

# The same header list is sent with every response, successful or not.
response_headers = [("content-type", "application/binary")]

# Cache of p5p archive indices, keyed by the path of the p5p archive, so
# that an archive's index only has to be loaded once per process.
p5p_indices = {}

# A lock to prevent two threads from rebuilding our catalog parts cache
# at the same time.
p5p_update_lock = threading.Lock()

class UnknownPathException(Exception):
        """Raised when a client asks for a path inside a p5p file that this
        application does not know how to serve.

        The offending path is kept in the 'path' attribute."""

        def __init__(self, path):
                self.path = path

        def __str__(self):
                requested = self.path
                return "Unknown path: %s" % requested


class MalformedQueryException(Exception):
        """Raised when this wsgi application is unable to parse the query
        sent by the client.

        'query' holds the query string as received and 'reason' a short
        explanation of what was wrong with it."""

        def __init__(self, query, reason):
                self.query, self.reason = query, reason

        def __str__(self):
                details = (self.query, self.reason)
                return "Malformed query %s: %s" % details


class MissingArchiveException(Exception):
        """Raised when the p5p file named by the configuration cannot be
        found.

        The path of the missing archive is kept in the 'path' attribute."""

        def __init__(self, path):
                self.path = path

        def __str__(self):
                archive = self.path
                return "Missing p5p archive: %s" % archive


class SysrepoP5p(object):
        """An object to handle a request for p5p file contents from the
        system repository.

        An instance is bound to a single WSGI request (its environ and
        start_response callable).  execute() parses the query string, opens
        the p5p archive the request refers to and returns the response
        body; close() releases the archive once the response is done."""

        # Characters allowed in the <hash> query value.  The value is
        # documented as a sha1 hash, so anything else cannot be one of our
        # archive aliases.  Checking this stops a client from selecting an
        # arbitrary WSGI environment key (e.g. a request header) as the
        # source of the archive path.
        _HASH_CHARS = frozenset("0123456789abcdefABCDEF")

        def __init__(self, environ, start_response):
                """Remember the WSGI environ and start_response callable.

                Raises KeyError if "QUERY_STRING" or "SYSREPO_RUNTIME_DIR"
                is missing from environ."""
                self.environ = environ
                self.start_response = start_response
                self.p5p_path = None
                self.p5p = None

                self.query = self.environ["QUERY_STRING"]
                self.runtime_dir = self.environ["SYSREPO_RUNTIME_DIR"]

        def close(self):
                """Release any resources we have used."""
                if self.p5p:
                        self.p5p.close()

        def log_exception(self, status=SERVER_ERROR_STATUS):
                """Print some information in the Apache log that will help
                determine what went wrong as well as updating the client
                response code.  The WSGI spec says we can call
                start_response multiple times, but must include exc_info
                if we do so.

                This must be called while an exception is being handled,
                as it reports the exception currently in flight."""

                # we only want error_log output if our status is not 4xx
                if status not in (SERVER_NOTFOUND_STATUS,
                    SERVER_BADREQUEST_STATUS):
                        print(traceback.format_exc())
                self.start_response(status, response_headers,
                    sys.exc_info())

        def _cache_dir(self, pub, hsh):
                """Return the directory beneath <runtime dir>/htdocs which
                holds the cached data for the given publisher and p5p
                hash."""
                return "%s/%s/%s" % (
                    os.path.join(self.runtime_dir, "htdocs"), pub, hsh)

        def need_update(self, pub, hsh):
                """Determine if we need to update our cached catalog and
                reload the index by comparing the last modification time of a
                file we create per p5p archive, and the p5p archive itself.

                Returns True if the cache must be rebuilt, otherwise False.
                Always returns True when PKG5_TEST_ENV is "True" in the
                environ.  Raises MissingArchiveException if the p5p archive
                does not exist; any other failure is logged through
                log_exception() and False is returned."""

                timestamp_path = "%s/sysrepo.timestamp" % \
                    self._cache_dir(pub, hsh)
                update = False

                # Locking here is quite basic: we want to ensure that no two
                # threads simultaneously decide that they need to rebuild our
                # local catalog cache, stepping on each others toes.  It is
                # possible that while processing a single query, a user will
                # replace the p5p file on the server after this method has been
                # called, causing stale data to be returned at best, and a HTTP
                # 500 response at worst (as the p5p index used by this web
                # application will not match the one in the new archive)
                with p5p_update_lock:
                        try:
                                # don't write a timestamp if we're testing
                                if self.environ.get("PKG5_TEST_ENV") == "True":
                                        return True

                                try:
                                        st_p5p = os.stat(self.p5p_path)
                                except OSError as e:
                                        if e.errno == errno.ENOENT:
                                                raise MissingArchiveException(
                                                    self.p5p_path)
                                        # Any other stat failure is reported
                                        # as itself (by the handler below)
                                        # rather than as a NameError on
                                        # st_p5p further down.
                                        raise

                                try:
                                        st_ts = os.stat(timestamp_path)
                                        stale = \
                                            st_ts.st_mtime < st_p5p.st_mtime
                                except OSError as e:
                                        # a missing timestamp means we have
                                        # never cached this archive; other
                                        # errors are ignored.
                                        stale = e.errno == errno.ENOENT

                                if stale:
                                        open(timestamp_path, "wb").close()
                                        update = True
                        except MissingArchiveException:
                                raise
                        except Exception:
                                self.log_exception()
                return update

        def _file_response(self, path, pub):
                """Process our file query.

                Returns the result of Archive.get_package_file() for the
                requested file, or None once an error has been reported
                through log_exception()."""

                self.start_response(SERVER_OK_STATUS, response_headers)
                try:
                        # use the basename of the path, which is the pkg(5)
                        # hash
                        return self.p5p.get_package_file(
                            os.path.basename(path), pub=pub)
                except pkg.p5p.UnknownArchiveFiles:
                        self.log_exception(status=SERVER_NOTFOUND_STATUS)
                except Exception:
                        self.log_exception()
                return None

        def _catalog_response(self, path, pub, hsh):
                """Process our catalog query.

                Returns an open (binary mode) file object for the cached
                catalog part, extracting it from the archive first if it
                isn't cached yet, or None once an error has been reported
                through log_exception()."""

                cat_part = os.path.basename(path)
                cat_path = "%s/catalog/1/%s" % (self._cache_dir(pub, hsh),
                    cat_part)
                self.start_response(SERVER_OK_STATUS, response_headers)
                if os.path.exists(cat_path):
                        return open(cat_path, "rb")

                # this is unlikely to happen: it implies a catalog part has been
                # requested that wasn't listed in the catalog.attrs file
                # extracted during _precache_catalog() or the file has been
                # removed on the server.  Do our best to return the content.
                cat_dir = os.path.dirname(cat_path)
                try:
                        with p5p_update_lock:
                                try:
                                        if not os.path.exists(cat_dir):
                                                os.makedirs(cat_dir, 0o755)
                                        self.p5p.extract_catalog1(cat_part,
                                            cat_dir, pub=pub)
                                        return open(cat_path, "rb")
                                except (pkg.p5p.UnknownArchiveFiles, IOError):
                                        self.log_exception(
                                            status=SERVER_NOTFOUND_STATUS)
                                except Exception:
                                        self.log_exception()
                except OSError as e:
                        # NOTE(review): the handlers above already deal
                        # with every exception raised by the extraction
                        # itself, so this looks reachable only if reporting
                        # the error raises OSError.  Retained as a last
                        # attempt to serve the file.
                        if e.errno == errno.ENOENT:
                                return open(cat_path, "rb")
                        raise
                return None

        def _manifest_response(self, path, pub):
                """Return our manifest response.

                Returns the result of Archive.get_package_manifest(...,
                raw=True) for the package named by path, or None once an
                error has been reported through log_exception()."""

                # Only strip the leading "manifest/0/"; a package name may
                # itself contain that sequence and must be left intact.
                prefix = "manifest/0/"
                if path.startswith(prefix):
                        pkg_name = path[len(prefix):]
                else:
                        pkg_name = path
                fmri = "pkg://%s/%s" % (pub, pkg_name)

                self.start_response(SERVER_OK_STATUS, response_headers)
                try:
                        return self.p5p.get_package_manifest(fmri, raw=True)
                except pkg.p5p.UnknownPackageManifest:
                        self.log_exception(status=SERVER_NOTFOUND_STATUS)
                except pkg.fmri.IllegalFmri:
                        self.log_exception(status=SERVER_NOTFOUND_STATUS)
                except Exception:
                        self.log_exception()
                return None

        def _precache_catalog(self, pub, hsh):
                """Extract catalog.attrs, and every catalog part it lists,
                from the archive into our catalog cache directory for the
                given publisher and hash.  Any existing cache directory is
                removed first."""

                cat_dir = "%s/catalog/1" % self._cache_dir(pub, hsh)

                if os.path.exists(cat_dir):
                        shutil.rmtree(cat_dir)

                os.makedirs(cat_dir)
                try:
                        self.p5p.extract_catalog1("catalog.attrs", cat_dir,
                            pub=pub)
                        with open(os.path.join(cat_dir, "catalog.attrs"),
                            "rb") as catalog_attrs:
                                attrs = simplejson.load(catalog_attrs)
                                for part in attrs["parts"]:
                                        self.p5p.extract_catalog1(part, cat_dir,
                                            pub=pub)

                except pkg.p5p.UnknownArchiveFiles:
                        # if the catalog part is unavailable,
                        # we ignore this for now.  It will be
                        # reported later anyway.
                        pass

        def _parse_query(self):
                """Parse our query, returning publisher, hash, and path
                values.

                Raises MalformedQueryException if the query is not a
                sequence of key=value pairs, if any of the three values is
                missing, if the hash is not a known archive alias or if the
                publisher could not be used safely as a directory name."""

                attrs = {}
                for keyval in self.query.split("&"):
                        try:
                                key, val = keyval.split("=", 1)
                        except ValueError:
                                raise MalformedQueryException(self.query,
                                    "missing key=value pair for %s." % keyval)
                        attrs[key] = val

                pub = attrs.get("pub")
                hsh = attrs.get("hash")
                path = attrs.get("path")

                if not hsh:
                        raise MalformedQueryException(self.query,
                            "missing hash.")
                # The hash is looked up in the WSGI environ, which also
                # holds client-influenced values (request headers, the
                # query string itself, ...), so insist that it looks like a
                # hash before trusting the lookup.
                if not self._HASH_CHARS.issuperset(hsh) or \
                    hsh not in self.environ:
                        raise MalformedQueryException(self.query,
                            "unknown hash %s." % hsh)
                if not pub:
                        raise MalformedQueryException(self.query,
                            "missing publisher.")
                # The publisher becomes a directory name beneath htdocs
                # (which _precache_catalog() may remove), so it must not be
                # able to point anywhere else.
                if "/" in pub or pub in (".", ".."):
                        raise MalformedQueryException(self.query,
                            "invalid publisher %s." % pub)
                if not path:
                        raise MalformedQueryException(self.query,
                            "missing path.")
                return pub, hsh, path

        def execute(self):
                """Process a query of the form:

                pub=<publisher>&hash=<hash>&path=<path>

                where:
                    <publisher>    the name of the publisher from the p5p file
                    <hash>         the sha1 hash of the location of the p5p file
                    <path>         the path of the pkg(5) client request

                In the environment of this WSGI application, apart from the
                default WSGI values, defined in PEP333, we expect:

                "SYSREPO_RUNTIME_DIR", a location pointing to the runtime
                directory, allowing us to serve static html from beneath a
                "htdocs" subdir.

                <hash>, which maps the sha1 hash of the p5p archive path, to the
                path itself, which is not visible to clients.

                Returns the response body produced by the matching
                _*_response() method.  When the request fails, the error
                status is reported via log_exception() and an empty list
                (or None, if the failure happened while building the body)
                is returned instead.
                """

                buf = []
                try:
                        pub, hsh, path = self._parse_query()
                        self.p5p_path = self.environ[hsh]
                        # In order to keep only one copy of the p5p index in
                        # memory, we cache it locally, and reuse it any time
                        # we're opening the same p5p file.  Before doing
                        # so, we need to ensure the p5p file hasn't been
                        # modified since we last looked at it.
                        if self.need_update(pub, hsh) or \
                            self.p5p_path not in p5p_indices:
                                with p5p_update_lock:
                                        self.p5p = pkg.p5p.Archive(
                                            self.p5p_path)
                                        p5p_indices[self.p5p_path] = \
                                            self.p5p.get_index()
                                        self._precache_catalog(pub, hsh)
                        else:
                                self.p5p = pkg.p5p.Archive(self.p5p_path,
                                    archive_index=p5p_indices[self.p5p_path])

                        if path.startswith("file"):
                                buf = self._file_response(path, pub)
                        elif path.startswith("catalog/1/"):
                                buf = self._catalog_response(path, pub, hsh)
                        elif path.startswith("manifest/0"):
                                buf = self._manifest_response(path, pub)
                        else:
                                raise UnknownPathException(path)
                except OSError as e:
                        if e.errno == errno.ENOENT:
                                self.log_exception(
                                    status=SERVER_NOTFOUND_STATUS)
                        else:
                                # never leave the request without a status
                                self.log_exception()
                except UnknownPathException:
                        self.log_exception(status=SERVER_NOTFOUND_STATUS)
                except MalformedQueryException:
                        self.log_exception(status=SERVER_BADREQUEST_STATUS)
                except Exception:
                        # includes MissingArchiveException: a missing
                        # archive is a server-side configuration problem.
                        self.log_exception()
                return buf


#
# CloseGenerator, AppWrapper and _application together form an idiom
# described at
# http://code.google.com/p/modwsgi/wiki/RegisteringCleanupCode
# They exist to ensure that we close any server-side resources used by
# our application at the end of the request (i.e. after the client has
# received the response).
#

def _application(environ, start_response):
        """Handle one request: build a SysrepoP5p for it, run the query and
        return a (response body, handler) pair so that the caller is able
        to close the handler when the response is finished with."""
        handler = SysrepoP5p(environ, start_response)
        return handler.execute(), handler


class CloseGenerator(object):
        """Pairs the iterable returned from the mod_wsgi application with an
        object that must be closed afterwards, giving the result the
        close() method described in PEP333."""

        def __init__(self, iterable, closeable):
                self.__iterable = iterable
                self.__closeable = closeable

        def __iter__(self):
                # A missing (or empty) iterable most likely means an
                # exception was hit while producing it, in which case there
                # is nothing to yield.
                source = self.__iterable
                if source:
                        for chunk in source:
                                yield chunk

        def close(self):
                # The closeable is closed even if closing the iterable
                # fails.
                try:
                        source = self.__iterable
                        if hasattr(source, "close"):
                                source.close()
                finally:
                        self.__closeable.close()


class AppWrapper(object):
        """Adapts a callable returning a (result, closeable) pair into a
        WSGI application whose response is a CloseGenerator built from that
        pair."""

        def __init__(self, application):
                self.__application = application

        def __call__(self, environ, start_response):
                response = self.__application(environ, start_response)
                body, owner = response
                return CloseGenerator(body, owner)


# The module-level WSGI callable.  Requests go through AppWrapper so that
# the CloseGenerator it returns can close the SysrepoP5p object (and the
# response iterable) once the response is finished with.
application = AppWrapper(_application)

if __name__ == "__main__":
        # A simple main function that allows us to test any given query/env
        # from the command line:
        #
        #     sysrepo_p5p <query> <hash>=<path to p5p file>
        #
        # The response status and body are printed.  Exits with status 2 on
        # a usage error.
        import urllib

        def start_response(status, response_headers, exc_info=None):
                """A dummy response function."""
                print("responding with %s" % status)
                if exc_info:
                        # exc_info is a (type, value, traceback) tuple, so
                        # format that exception explicitly instead of
                        # passing the tuple to format_exc() as its limit.
                        print("".join(
                            traceback.format_exception(*exc_info)))

        if len(sys.argv) != 3 or "=" not in sys.argv[2]:
                query = \
                ("'pub=test&hash=de5acae11333890c457665379eec812a67f78dd3"
                "&path=manifest/0/mypackage@1.2.9%2C5.11-1%3A20110617T204846Z'")
                alias = \
                "de5acae11333890c457665379eec812a67f78dd3=/tmp/archive.p5p"
                print("usage: sysrepo_p5p <query> <hash>=<path to p5p file>")
                print("eg: ./sysrepo_p5p.py %s %s" % (query, alias))
                sys.exit(2)

        environ = {}

        # unquote the url, so that we can easily copy/paste entries from
        # Apache logs when testing.
        environ["QUERY_STRING"] = urllib.unquote(sys.argv[1])
        environ["SYSREPO_RUNTIME_DIR"] = os.environ["PWD"]
        environ["PKG5_TEST_ENV"] = "True"
        # only split on the first "=", the archive path may contain others
        hsh, path = sys.argv[2].split("=", 1)
        environ[hsh] = path

        result = application(environ, start_response)
        try:
                for response in result:
                        if isinstance(response, basestring):
                                print(response.rstrip())
                        elif response:
                                for line in response.readlines():
                                        print(line.rstrip())
        finally:
                # as a WSGI server would, release the archive when done
                result.close()