# -*- coding: utf-8 -*-
"""
  eventlogging.handlers
  ~~~~~~~~~~~~~~~~~~~~~

  This module contains the set of event readers and event writers that ship
  with EventLogging. Event readers are generators that yield successive events
  from a stream. Event writers are coroutines that receive events and store or
  forward them. Both readers and writers are designed to be configurable using
  URIs.
  :func:`eventlogging.drive` pumps data through a reader-writer pair.

"""
import collections
import datetime
import glob
import imp
import inspect
import json

from functools import partial
from kafka import KafkaClient
from kafka import KeyedProducer
from kafka import SimpleProducer
from kafka.producer.base import Producer
from kafka.common import KafkaTimeoutError
from pykafka import KafkaClient as PyKafkaClient
from pykafka import BalancedConsumer

import logging
import logging.handlers
import os
import re
import socket
import sys
import sqlalchemy
import statsd
import time
import traceback
import uuid

from .compat import items
from .utils import PeriodicThread, uri_delete_query_item
from .factory import writes, reads
from .streams import stream, pub_socket, sub_socket, udp_socket
from .jrm import store_sql_events, DB_FLUSH_INTERVAL

__all__ = ('load_plugins',)

# EventLogging will attempt to load plug-ins from the directory specified in
# the 'EVENTLOGGING_PLUGIN_DIR' environment variable if it is defined. If it
# is not defined, EventLogging will default to the directory specified below.
DEFAULT_PLUGIN_DIR = '/usr/local/lib/eventlogging'


def load_plugins(path=None):
    """Load EventLogging plug-ins from `path`. Plug-in module names are mangled
    to prevent clobbering modules in the Python module search path."""
    if path is None:
        path = os.environ.get('EVENTLOGGING_PLUGIN_DIR', DEFAULT_PLUGIN_DIR)
    for plugin in glob.glob(os.path.join(path, '*.py')):
        imp.load_source('__eventlogging_plugin_%x__' % hash(plugin), plugin)
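
# A minimal plug-in sketch, assuming a hypothetical file
# /usr/local/lib/eventlogging/null.py in the plug-in directory. Plug-ins are
# ordinary Python modules that may register extra handlers using the same
# decorators used in this module:
#
#   from eventlogging.factory import writes
#
#   @writes('null')
#   def null_writer(uri):
#       """Discard every event received."""
#       while 1:
#           event = (yield)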


#
# Writers
#

@writes('mongodb')
def mongodb_writer(uri, database='events'):
    import pymongo

    client = pymongo.MongoClient(uri)
    db = client[database]
    datetime_from_timestamp = datetime.datetime.fromtimestamp

    while 1:
        event = (yield)
        event['timestamp'] = datetime_from_timestamp(event['timestamp'])
        event['_id'] = event['uuid']
        collection = event['schema']
        db[collection].insert(event)


@writes('kafka')
def kafka_writer(
    path,
    producer='simple',
    topic='eventlogging_%(schema)s',
    key='%(schema)s_%(revision)s',
    blacklist=None,
    raw=False,
    **kafka_producer_args
):
    """
    Write events to Kafka.

    Kafka URIs look like:
    kafka:///b1:9092,b2:9092?topic=eventlogging_%(schema)s&async=True&...

    This producer uses either SimpleProducer or KeyedProducer from
    kafka-python.  You may pass any configs that base Producer takes
    as keyword arguments via URI query params.

    NOTE:  If you do not explicitly set it, async will default to True.

        path      - URI path should be comma separated Kafka Brokers.
                    e.g. kafka01:9092,kafka02:9092,kafka03:9092

        producer  - Either 'keyed' or 'simple'.  Default: 'simple'.

        topic     - Python format string topic name.
                    If the incoming event is a dict (not a raw string)
                    topic will be interpolated against event.  I.e.
                    topic % event.  Default: eventlogging_%(schema)s

        key       - Python format string key of the event message in Kafka.
                    If the incoming event is a dict (not a raw string)
                    key will be interpolated against event.  I.e.
                    key % event.  Default: %(schema)s_%(revision)s.
                    This is ignored if you are using the simple producer.

        blacklist - Regular expression matching schema names that should not
                    be written. This is useful to keep high volume schemas
                    from being written to an output stream. This will
                    be ignored if the incoming events are raw.

        raw       - Should the events be written as raw (encoded) or not?
    """

    # Brokers should be in the uri path
    brokers = path.strip('/')

    # remove non Kafka Producer args from kafka_producer_args
    kafka_producer_args = {
        k: v for k, v in items(kafka_producer_args)
        if k in inspect.getargspec(Producer.__init__).args
    }

    # Use async producer by default
    if 'async' not in kafka_producer_args:
        kafka_producer_args['async'] = True

    kafka = KafkaClient(brokers)

    if producer == 'keyed':
        ProducerClass = KeyedProducer
    else:
        ProducerClass = SimpleProducer

    kafka_producer = ProducerClass(kafka, **kafka_producer_args)

    # These will be used if incoming events are not interpolatable.
    default_topic = topic.encode('utf8')
    default_key = key.encode('utf8')

    kafka_topic_create_timeout_seconds = 0.1

    if blacklist:
        blacklist_pattern = re.compile(blacklist)
    else:
        blacklist_pattern = None

    while 1:
        event = (yield)

        # If event is a dict (not Raw) then we can interpolate topic and key
        # as format strings.
        # E.g. message_topic = 'eventlogging_%(schema)s' % event.
        # WARNING!  Be sure that your topic and key strings don't try
        # to interpolate out a field in event that doesn't exist!
        if isinstance(event, dict):
            if blacklist_pattern and blacklist_pattern.match(event['schema']):
                logging.debug(
                    '%s is blacklisted, not writing event %s.' %
                    (event['schema'], event['uuid'])
                )
                continue

            message_topic = (topic % event).encode('utf8')
            if producer == 'keyed':
                message_key = (key % event).encode('utf8')
        else:
            message_topic = default_topic
            message_key = default_key

        try:
            # Make sure this topic exists before we attempt to produce to it.
            # This call will timeout in kafka_topic_create_timeout_seconds.
            # This should return faster than this if this kafka client has
            # already cached topic metadata for this topic.  Otherwise
            # it will try to ask Kafka for it each time.  Make sure
            # auto.create.topics.enabled is true for your Kafka cluster!
            kafka.ensure_topic_exists(
                message_topic,
                kafka_topic_create_timeout_seconds
            )
        except KafkaTimeoutError:
            error_message = "Failed to ensure Kafka topic %s exists " \
                "in %f seconds when producing event" % (
                    message_topic,
                    kafka_topic_create_timeout_seconds
                )
            if isinstance(event, dict):
                error_message += " of schema %s revision %d" % (
                    event['schema'],
                    event['revision']
                )
            error_message += ". Skipping event. " \
                "(This might be ok if this is a new topic.)"
            logging.warn(error_message)
            continue

        if raw:
            value = event.encode('utf-8')
        else:
            value = json.dumps(event, sort_keys=True)

        # send_messages() for the different producer types have different
        # signatures.  Call it appropriately.
        if producer == 'keyed':
            kafka_producer.send_messages(message_topic, message_key, value)
        else:
            kafka_producer.send_messages(message_topic, value)


def insert_stats(stats, inserted_count):
    """
    Callback that increments the MySQL inserted-events metric in StatsD.
    It is called after events have been successfully inserted into MySQL.

        stats           - Instance of statsd.StatsClient, or None
        inserted_count  - Number of events that have been inserted
    """
    if stats:
        stats.incr('overall.inserted', inserted_count)


@writes('mysql', 'sqlite')
def sql_writer(uri, replace=False, statsd_host=''):
    """Writes to an RDBMS, creating tables for SCIDs and rows for events."""
    # Don't pass 'replace' and 'statsd_host' parameter to SQLAlchemy.
    uri = uri_delete_query_item(uri, 'replace')
    uri = uri_delete_query_item(uri, 'statsd_host')
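    # Example handler URI (hypothetical host and credentials); 'replace' and
    # 'statsd_host' are stripped above so only SQLAlchemy-relevant query
    # parameters reach the engine:
    #   mysql://user:secret@db.example.org/log?charset=utf8
    #       &replace=True&statsd_host=statsd.example.org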

    logger = logging.getLogger('Log')

    # Create a statsd client instance if statsd_host is specified
    stats = None
    if statsd_host:
        stats = statsd.StatsClient(statsd_host, 8125, prefix='eventlogging')

    meta = sqlalchemy.MetaData(bind=uri)
    # Each scid stores a buffer and the timestamp of the first insertion.
    events = collections.defaultdict(lambda: ([], time.time()))
    events_batch = collections.deque()
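    # Shape of the two buffers, with illustrative values:
    #   events       = {('Edit', 13457736): ([ev1, ev2], 1428424682.3), ...}
    #   events_batch = deque([(('Edit', 13457736), [ev1, ev2]), ...])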
    # Since the worker is unaware of the statsd host, create a partial
    # that binds the statsd client argument to the callback
    worker = PeriodicThread(interval=DB_FLUSH_INTERVAL,
                            target=store_sql_events,
                            args=(meta, events_batch),
                            kwargs={'replace': replace,
                                    'on_insert_callback':
                                        partial(insert_stats, stats)})
    worker.start()

    if meta.bind.dialect.name == 'mysql':
        @sqlalchemy.event.listens_for(sqlalchemy.pool.Pool, 'checkout')
        def ping(dbapi_connection, connection_record, connection_proxy):
            # Just before executing an insert, call mysql_ping() to verify
            # that the connection is alive, and reconnect if necessary.
            dbapi_connection.ping(True)
    try:
        batch_size = 5000
        batch_time = 300  # in seconds
        # Max number of batches pending insertion.
        queue_size = 1000
        sleep_seconds = 5
        # Link the main thread to the worker thread so we
        # don't keep filling the queue if the worker died.
        while worker.is_alive():
            # If the queue is too big, wait for the worker to empty it.
            while len(events_batch) > queue_size:
                logger.info('Sleeping %d seconds', sleep_seconds)
                time.sleep(sleep_seconds)
            event = (yield)
            # Break the event stream by schema (and revision)
            scid = (event['schema'], event['revision'])
            scid_events, first_timestamp = events[scid]
            scid_events.append(event)
            if stats:
                stats.incr('overall.insertAttempted')
            # Check if the schema queue is too long or too old
            if (len(scid_events) >= batch_size or
                    time.time() - first_timestamp >= batch_time):
                logger.info('%s_%s queue is large or old, flushing', *scid)
                events_batch.append((scid, scid_events))
                del events[scid]
    except GeneratorExit:
        # Allow the worker to complete any work that is
        # already in progress before shutting down.
        logger.info('Stopped main thread via GeneratorExit')
        logger.info('Events when stopped %s', len(events))
        worker.stop()
        worker.join()
    except Exception:
        t = traceback.format_exc()
        logger.warn('Exception caught %s', t)
        raise
    finally:
        # If there are any events remaining in the queue,
        # process them in the main thread before exiting.
        for scid, (scid_events, _) in items(events):
            events_batch.append((scid, scid_events))
        store_sql_events(meta, events_batch, replace=replace,
                         on_insert_callback=partial(insert_stats, stats))


@writes('file')
def log_writer(path, raw=False):
    """Write events to a file on disk."""
    handler = logging.handlers.WatchedFileHandler(path)

    # We want to be able to support multiple file writers
    # within a given Python process, so uniquely
    # identify this logger within Python's logging
    # system by the file's path.
    log = logging.getLogger('Events-' + path)

    log.setLevel(logging.INFO)
    log.addHandler(handler)
    # Don't propagate these events to the global logger
    # used by eventlogging.  We don't want eventlogging
    # daemons to print these event logs to stdout or stderr
    # all the time.
    log.propagate = False
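    # For example (hypothetical paths), writers for /srv/log/a.log and
    # /srv/log/b.log get separate 'Events-/srv/log/a.log' and
    # 'Events-/srv/log/b.log' loggers, so events sent to one writer never
    # end up in the other file.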

    while 1:
        event = (yield)
        if raw:
            log.info(event)
        else:
            log.info(json.dumps(event, sort_keys=True, check_circular=False))


@writes('tcp')
def zeromq_writer(uri, raw=False):
    """Publish events on a ZeroMQ publisher socket."""
    pub = pub_socket(uri)
    while 1:
        event = (yield)
        if raw:
            pub.send_unicode(event)
        else:
            pub.send_unicode(json.dumps(event,
                                        sort_keys=True,
                                        check_circular=False) + '\n')


@writes('statsd')
def statsd_writer(hostname, port, prefix='eventlogging.schema'):
    """Increments StatsD SCID counters for each event."""
    addr = socket.gethostbyname(hostname), port
    sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    while 1:
        event = (yield)
        stat = prefix + '.%(schema)s:1|c' % event
        sock.sendto(stat.encode('utf-8'), addr)


@writes('stdout')
def stdout_writer(uri, raw=False):
    """Writes events to stdout. Pretty-prints if stdout is a terminal."""
    dumps_kwargs = dict(sort_keys=True, check_circular=False)
    if sys.stdout.isatty():
        dumps_kwargs.update(indent=2)
    while 1:
        event = (yield)
        if raw:
            print(event)
        else:
            print(json.dumps(event, **dumps_kwargs))


@writes('udp')
def udp_writer(hostname, port, raw=False):
    """Writes data to UDP."""
    sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    while 1:
        event = (yield)
        if raw:
            sock.sendto(event, (hostname, port))
        else:
            sock.sendto(json.dumps(event), (hostname, port))

#
# Readers
#


@reads('stdin')
def stdin_reader(uri, raw=False):
    """Reads data from standard input."""
    return stream(sys.stdin, raw)


@reads('tcp')
def zeromq_subscriber(uri, identity=None, subscribe='', raw=False):
    """Reads data from a ZeroMQ publisher. If `raw` is truthy, reads
    unicode strings. Otherwise, reads JSON."""
    sock = sub_socket(uri, identity=identity, subscribe=subscribe)
    return stream(sock, raw)


@reads('udp')
def udp_reader(hostname, port, raw=False):
    """Reads data from a UDP socket."""
    return stream(udp_socket(hostname, port), raw)


@reads('kafka')
def kafka_reader(
    path,
    topic='eventlogging',
    identity='',
    raw=False,
    **kafka_consumer_args
):
    """
    Reads events from Kafka.

    Kafka URIs look like:
    kafka:///b1:9092,b2:9092?topic=topic_name&identity=consumer_group_name&
    auto_commit_enable=True&auto_commit_interval_ms=1000...

    This reader uses the pykafka BalancedConsumer.  You may pass
    any configs that BalancedConsumer takes as keyword arguments via
    the kafka URI query params.

    auto_commit_interval_ms defaults to 60 seconds. This is fairly high and
    may lead to more duplicate message consumption (Kafka only guarantees
    at-least-once delivery). Lowering it (to 1 second, say) reduces the
    number of duplicates, but incurs the overhead of committing offsets to
    Zookeeper more often.

    If auto_commit_enable is True, then messages will be marked as done
    after each auto_commit_interval_ms period, regardless of whether the
    work is actually complete. E.g. if inserting into MySQL and the process
    dies somewhere along the way, it is possible that offsets will be
    committed to Kafka for messages that have not been inserted into MySQL.
    Future work will have to fix this problem somehow. Perhaps a callback?
    """
    # The identity param is used to define the consumer group name.
    # If identity is empty, create a unique default so that we don't
    # accidentally put unrelated consumers in the same group. Explicitly
    # specify identity to launch consumers in the same consumer group.
    identity = identity if identity else 'eventlogging-' + str(uuid.uuid1())
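    # e.g. (hypothetical values):
    #   identity=''          -> group 'eventlogging-6ba7b810-...': unique,
    #                           so this process consumes all partitions alone
    #   identity='mysql-m4'  -> every consumer started with this name joins
    #                           the same group and the topic's partitions are
    #                           balanced between them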

    # Brokers should be in the uri path.
    # path.strip('/') returns a 'unicode' object and pykafka expects a str,
    # so convert unicode to str.
    brokers = path.strip('/').encode('ascii', 'ignore')

    # remove non KafkaConsumer args from kafka_consumer_args
    kafka_consumer_args = {
        k: v for k, v in items(kafka_consumer_args)
        if k in inspect.getargspec(BalancedConsumer.__init__).args
    }

    kafka_client = PyKafkaClient(hosts=brokers)
    kafka_topic = kafka_client.topics[topic]

    consumer = kafka_topic.get_balanced_consumer(
        consumer_group=identity.encode('ascii', 'ignore'),
        **kafka_consumer_args)

    # Define a generator to read from the BalancedConsumer instance
    def message_stream(consumer):
        while True:
            yield consumer.consume()

    return stream((message.value for message in message_stream(consumer)), raw)