#!/usr/local/bin/python2.7
# -*- coding: utf-8 -*-
#
# urlwatch is a minimalistic URL watcher written in Python
#
# Copyright (c) 2008-2011 Thomas Perl <thp.io/about>
# All rights reserved.
# 
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
# 3. The name of the author may not be used to endorse or promote products
#    derived from this software without specific prior written permission.
# 
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
# IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#

"""Watch web pages and arbitrary URLs for changes"""

# Short package name; also used for the per-user directory (~/.urlwatch),
# the logger name and the version banner.
pkgname = 'urlwatch'

# Package metadata (printed in the report footer and in --version output)
__author__ = 'Thomas Perl <m@thp.io>'
__copyright__ = 'Copyright 2008-2011 Thomas Perl'
__license__ = 'BSD'
__homepage__ = 'http://thp.io/2008/urlwatch/'
__version__ = '1.15'

# Value sent as the "User-agent" request header for every job
user_agent = '%s/%s (+http://thp.io/2008/urlwatch/info.html)' % (pkgname, __version__)

# Configuration section
# display_errors: include retrieval errors in the report (the -e option
#                 switches this on at runtime)
# line_length:    width of the separator lines used in the report
display_errors = False
line_length = 75


# File and folder paths
import sys
import os.path

# Per-user state lives in a hidden folder below the home directory
urlwatch_dir = os.path.expanduser(os.path.join('~', '.' + pkgname))
cache_dir = os.path.join(urlwatch_dir, 'cache')
scripts_dir = os.path.join(urlwatch_dir, 'lib')
urls_txt = os.path.join(urlwatch_dir, 'urls.txt')
hooks_py = os.path.join(scripts_dir, 'hooks.py')

# Split the folder that contains the running script into its parent
# ("prefix") and its own name ("bindir") to guess the install layout
prefix, bindir = os.path.split(
        os.path.dirname(os.path.abspath(sys.argv[0])))

if bindir != 'bin':
    # Script is not inside a "bin" folder: assume we are not yet
    # installed, so make the bundled "lib" folder importable and take
    # the examples from right next to the script
    sys.path.append(os.path.join(prefix, bindir, 'lib'))
    examples_dir = os.path.join(prefix, bindir, 'examples')
else:
    # Script is inside a "bin" folder: assume a system-wide installation
    examples_dir = os.path.join(prefix, 'share', 'examples', pkgname)

# Example files that get copied into the user's folders on first run
urls_txt_example = os.path.join(examples_dir, 'urls.txt.example')
hooks_py_example = os.path.join(examples_dir, 'hooks.py.example')

# Code section

import shutil
import os
import stat
import urllib2
import httplib
import email.utils
import time
import socket
import difflib
import datetime
import optparse
import logging
import imp

# Python 3.2 includes "concurrent.futures", for older versions,
# use "pip install futures" or http://code.google.com/p/pythonfutures/
import concurrent.futures

from urlwatch import handler

class NullHandler(logging.Handler):
    """Logging handler that discards every record it receives."""

    def emit(self, record):
        """Ignore the given log record."""
        return None


# Package logger: lets every level through, but the only handler attached
# here drops all records, so nothing is shown until another handler (such
# as the console handler enabled by --verbose) is added.
log = logging.getLogger(pkgname)
log.addHandler(NullHandler())
log.setLevel(logging.DEBUG)

# Default timeout of one minute (=60 seconds) for newly created sockets,
# so that a single request cannot hang forever
socket.setdefaulttimeout(60)

# Printed when the urls.txt file is missing. Takes two values: the path
# where urls.txt is expected and the path of the example file.
ERROR_MESSAGE_URLS_TXT = """
Error: You need to create a urls.txt file first.

Place it in %s
An example is available in %s
"""

# Printed in addition when no hooks file was given on the command line.
# Takes two values: the hooks.py path and the path of its example file.
ERROR_MESSAGE_HOOKS_PY = """
You can also create %s
An example is available in %s
"""

# Maximum number of threads used to process jobs concurrently
MAX_WORKERS = 10

def foutput(type, url, content=None, summary=None, c='*', n=line_length):
    """Build the report lines for a single event

    Creates a block for a message of the given type (e.g. 'changed')
    about "url", optionally followed by a (possibly multi-line) content
    section. Separator lines consist of the character "c" repeated "n"
    times.

    If "summary" is given it has to be a list; a one-line description
    of this event is appended to it (with the length of the content in
    parentheses when content is present).

    The block is returned as a list of strings.
    """
    headline = type.upper() + ': ' + str(url)
    separator = c * n

    has_content = content is not None
    body = str(content) if has_content else None

    if summary is not None:
        entry = headline
        if has_content:
            entry = '%s (%d bytes)' % (headline, len(body))
        summary.append(entry)

    lines = [separator, headline]
    if has_content:
        lines.extend([separator, body])
    # Closing separator followed by two empty lines as spacing
    lines.extend([separator, '', ''])

    return lines


if __name__ == '__main__':
    # Script entry point: parse the command line, make sure the user's
    # folders and files exist, run all jobs in a thread pool, compare each
    # result with its cached copy and finally print a report.
    start = datetime.datetime.now()

    # Option parser
    parser = optparse.OptionParser(usage='%%prog [options]\n\n%s' % __doc__.strip(), version=pkgname+' '+__version__)
    parser.add_option('-v', '--verbose', action='store_true', dest='verbose', help='Show debug/log output')
    parser.add_option('', '--urls', dest='urls', metavar='FILE', help='Read URLs from the specified file')
    parser.add_option('', '--hooks', dest='hooks', metavar='FILE', help='Use specified file as hooks.py module')
    parser.add_option('-e', '--display-errors', action='store_true', dest='display_errors', help='Include HTTP errors (404, etc..) in the output')

    parser.set_defaults(verbose=False, display_errors=False)

    # Let optparse take sys.argv[1:] on its own; passing the complete
    # sys.argv would turn the program name into a positional argument
    (options, args) = parser.parse_args()

    if options.verbose:
        # Enable logging to the console
        console = logging.StreamHandler()
        console.setLevel(logging.DEBUG)
        formatter = logging.Formatter('%(asctime)s %(levelname)s: %(message)s')
        console.setFormatter(formatter)
        log.addHandler(console)
        log.info('turning on verbose logging mode')

    if options.display_errors:
        log.info('turning display of errors ON')
        display_errors = True

    if options.urls:
        if os.path.isfile(options.urls):
            urls_txt = options.urls
            log.info('using %s as urls.txt' % options.urls)
        else:
            log.error('%s is not a file' % options.urls)
            print 'Error: %s is not a file' % options.urls
            sys.exit(1)

    if options.hooks:
        if os.path.isfile(options.hooks):
            hooks_py = options.hooks
            log.info('using %s as hooks.py' % options.hooks)
        else:
            log.error('%s is not a file' % options.hooks)
            print 'Error: %s is not a file' % options.hooks
            sys.exit(1)

    # Create all needed folders
    for needed_dir in (urlwatch_dir, cache_dir, scripts_dir):
        if not os.path.isdir(needed_dir):
            os.makedirs(needed_dir)

    # Check for required files
    if not os.path.isfile(urls_txt):
        log.warning('not a file: %s' % urls_txt)
        # Place the example files right next to where the real files
        # are expected, keeping the ".example" file names
        urls_txt_fn = os.path.join(os.path.dirname(urls_txt), os.path.basename(urls_txt_example))
        hooks_py_fn = os.path.join(os.path.dirname(hooks_py), os.path.basename(hooks_py_example))
        print ERROR_MESSAGE_URLS_TXT % (urls_txt, urls_txt_fn)
        if not options.hooks:
            print ERROR_MESSAGE_HOOKS_PY % (hooks_py, hooks_py_fn)
        # Never overwrite an example file that has been copied before
        if os.path.exists(urls_txt_example) and not os.path.exists(urls_txt_fn):
            shutil.copy(urls_txt_example, urls_txt_fn)
        if not options.hooks and os.path.exists(hooks_py_example) and not os.path.exists(hooks_py_fn):
            shutil.copy(hooks_py_example, hooks_py_fn)
        sys.exit(1)

    headers = {
            'User-agent': user_agent,
    }

    summary = []
    details = []
    count = 0

    # Default filter: hand back the second argument unmodified
    filter_func = lambda x, y: y

    if os.path.exists(hooks_py):
        log.info('using hooks.py from %s' % hooks_py)
        hooks = imp.load_source('hooks', hooks_py)
        if hasattr(hooks, 'filter'):
            log.info('found and enabled filter function from hooks.py')
            filter_func = hooks.filter
        else:
            log.warning('hooks.py has no filter function - ignoring')
    else:
        log.info('not using hooks.py (file not found)')

    def process_job(job):
        """Retrieve a single job (runs in a worker thread)

        Returns a tuple (cache filename, modification time of the cache
        file or None if there is none yet, retrieved data). Exceptions
        raised by job.retrieve() are not handled here.
        """
        log.info('now processing: %s', job.location)
        filename = os.path.join(cache_dir, job.get_guid())
        timestamp = None

        if os.path.exists(filename):
            timestamp = os.stat(filename)[stat.ST_MTIME]

        data = job.retrieve(timestamp, filter_func, headers, log)
        return filename, timestamp, data

    jobs = handler.parse_urls_txt(urls_txt)
    log.info('processing %d jobs', len(jobs))

    executor = concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS)

    # Remember which job belongs to which future for reporting
    future_to_job = dict((executor.submit(process_job, job), job)
            for job in jobs)

    for future in concurrent.futures.as_completed(future_to_job):
        job = future_to_job[future]

        log.info('job finished: %s' % job.location)

        try:
            # result() re-raises any exception raised inside the worker,
            # which is then dealt with by the handlers below
            filename, timestamp, data = future.result()

            if isinstance(data, unicode):
                # Fix for Python 2's unicode/str woes: the cache holds
                # byte strings, so compare and store UTF-8 encoded data
                data = data.encode('utf-8')

            if os.path.exists(filename):
                log.info('%s exists - creating unified diff' % filename)
                with open(filename) as cache_file:
                    old_data = cache_file.read()

                timestamp_old = email.utils.formatdate(timestamp, localtime=1)
                timestamp_new = email.utils.formatdate(time.time(), localtime=1)
                diff = ''.join(difflib.unified_diff(
                        old_data.splitlines(1),
                        data.splitlines(1),
                        '@',
                        '@',
                        timestamp_old,
                        timestamp_new))
                if diff:
                    log.info('%s has changed - adding diff' % job)
                    details += foutput('changed', job, diff, summary)
                else:
                    log.info('%s has not changed' % job)
            else:
                log.info('%s does not exist - is considered "new"' % filename)
                details += foutput('new', job, None, summary)

            # The cache file is rewritten even if nothing has changed
            log.info('writing current content of %s to %s' % (job, filename))
            with open(filename, 'w') as cache_file:
                cache_file.write(data)
        except urllib2.HTTPError as error:
            if error.code == 304:
                log.info('%s has not changed (HTTP 304)' % job)
            else:
                log.error('got HTTPError while loading url: %s' % error)
                if display_errors:
                    details += foutput('error', job, error, summary)
        except handler.ShellError as error:
            log.error('Shell returned %d' % error.result)
            if display_errors:
                details += foutput('error', job, error, summary)
        except urllib2.URLError as error:
            log.error('got URLError while loading url: %s' % error)
            if display_errors:
                details += foutput('error', job, error, summary)
        except socket.timeout as error:
            # Has to come before the IOError handler: socket.timeout is
            # a subclass of IOError (via socket.error) since Python 2.6
            # and would never be reached otherwise
            log.error('got timeout while loading url: %s' % error)
            if display_errors:
                details += foutput('error', job, error, summary)
        except IOError as error:
            log.error('got IOError while loading url: %s' % error)
            if display_errors:
                details += foutput('error', job, error, summary)
        except httplib.error as error:
            # This is to workaround a bug in urllib2, see
            # http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=529740
            log.error('got httplib error while loading url: %s' % error)
            if display_errors:
                details += foutput('error', job, (repr(error) +
                        '\n' + str(error)).strip(), summary)

        count += 1

    end = datetime.datetime.now()

    # Output everything
    # The summary is only printed when it has at least two entries
    if len(summary) > 1:
        log.info('printing summary with %d items' % len(summary))
        print '-'*line_length
        print 'summary: %d changes' % (len(summary),)
        print ''
        for id, line in enumerate(summary):
            print '%02d. %s' % (id+1, line)
        print '-'*line_length
        print '\n\n\n'
    else:
        log.info('summary is too short - not printing')
    if len(details) > 1:
        log.info('printing details with %d items' % len(details))
        print '\n'.join(details)
        print '-- '
        print '%s %s, %s' % (pkgname, __version__, __copyright__)
        print 'Website: %s' % (__homepage__,)
        print 'watched %d URLs in %d seconds\n' % (count, (end-start).seconds)
    else:
        log.info('no details collected - not printing')

