delojza/delojza.py

#!/usr/bin/env python3

import errno
import logging
import os
import re
import shutil
import sys
from configparser import ConfigParser
from datetime import datetime
from glob import glob
from operator import itemgetter

import acoustid
import filetype
import markovify
import mutagen.id3
import pytumblr
import requests
import telegram
import youtube_dl
from telegram import MessageEntity
from telegram.ext import Updater, CommandHandler, MessageHandler, Filters
from youtube_dl.version import __version__ as YTDL_VERSION


def mkdir_p(path):
    """Create ``path`` and any missing parents, like ``mkdir -p``.

    Does nothing if ``path`` already is a directory. Raises ``OSError``
    (``FileExistsError``) if ``path`` exists but is not a directory.
    """
    os.makedirs(path, exist_ok=True)


def datestr(date):
    """Render ``date`` as ``YYYY-MM-DD@HHMM`` (used as a filename prefix)."""
    stamp_format = "%Y-%m-%d@%H%M"
    return date.strftime(stamp_format)


class DelojzaBot:
    def __init__(self, tg_api_key, out_dir, tmp_dir='/var/tmp', acoustid_key=None, tumblr_keys=None, markov=None):
        """Set up the Telegram updater, register all handlers and optional integrations.

        :param tg_api_key: token handed to ``telegram.ext.Updater``.
        :param out_dir: base directory; downloads end up in per-hashtag subdirectories.
        :param tmp_dir: directory youtube-dl downloads into before files are moved.
        :param acoustid_key: AcoustID API key used when tagging MP3 files, or None.
        :param tumblr_keys: positional arguments for ``pytumblr.TumblrRestClient``;
                            None/empty disables Tumblr posting.
        :param markov: object offering ``make_sentence``/``add_to_corpus``, or None.
        """
        self.logger = logging.getLogger("delojza")

        self.out_dir = out_dir
        self.logger.debug('OUT_DIR: ' + out_dir)
        self.tmp_dir = tmp_dir
        self.logger.debug('TMP_DIR: ' + tmp_dir)
        self.markov = markov

        self.updater = Updater(tg_api_key)
        dp = self.updater.dispatcher

        dp.add_handler(CommandHandler("start", self.tg_start))
        dp.add_error_handler(self.tg_error)
        self.tg_url_handler = MessageHandler(Filters.entity(MessageEntity.URL), self.tg_handle_url)
        dp.add_handler(self.tg_url_handler)
        self.tg_rest_handler = MessageHandler(Filters.photo | Filters.video | Filters.video_note |
                                              Filters.audio | Filters.voice | Filters.document, self.tg_handle_rest)
        dp.add_handler(self.tg_rest_handler)
        dp.add_handler(MessageHandler(Filters.entity(MessageEntity.HASHTAG), self.tg_handle_hashtag))
        dp.add_handler(MessageHandler(Filters.text, self.tg_handle_text))
        dp.add_handler(CommandHandler("stats", self.tg_stats))
        dp.add_handler(CommandHandler("orphans", self.tg_orphan))
        dp.add_handler(CommandHandler("orphans_full", self.tg_orphan_full))
        dp.add_handler(CommandHandler("delete", self.tg_delete))
        dp.add_handler(CommandHandler("version", self.tg_version))

        self.acoustid_key = acoustid_key

        # Always define the attribute so that code checking ``self.client`` works
        # even when Tumblr is not configured.
        self.client = None
        if tumblr_keys:
            self.client = pytumblr.TumblrRestClient(*tumblr_keys)

        self.last_downloaded = []
        self.last_hashtag = None

    @staticmethod
    def ytdl_can(url):
        """Tell whether youtube-dl has a dedicated (non-generic) extractor for ``url``.

        URLs containing ``/channel/`` are never accepted.
        """
        return any(
            extractor.suitable(url) and extractor.IE_NAME != 'generic' and '/channel/' not in url
            for extractor in youtube_dl.extractor.gen_extractors()
        )

    def tag_file(self, filepath, message, info=None):
        if info is None:
            info = {}

        title = None
        artist = None
        source = None

        if 'track' in info:
            title = info['track']
        if 'artist' in info:
            artist = info['artist']

        if 'track' in info or 'artist' in info:
            source = "supplied metadata"

        if title is None or artist is None and self.acoustid_key:
            try:
                self.logger.debug("Requesting AcoustID for {}".format(filepath))
                results = sorted(acoustid.match(self.acoustid_key, filepath), key=itemgetter(0), reverse=True)
                if len(results) > 0:
                    score, rid, aid_title, aid_artist = results[0]
                    if score > .8:
                        title = aid_title
                        artist = aid_artist
                        source = "AcoustID ({}%)".format(round(score * 100))
            except acoustid.NoBackendError:
                self.logger.warning("chromaprint library/tool not found")
            except acoustid.FingerprintGenerationError:
                self.logger.warning("fingerprint could not be calculated")
            except acoustid.WebServiceError as exc:
                self.logger.warning("web service request failed: {}".format(exc.message))

        if title is None and artist is None and '-' in info.get("title", ""):
            split = info['title'].split("-")
            artist = split[0]
            title = split[1]
            source = "fallback (artist - title)"

        if title is None and 'title' in info:
            title = info['title']
            source = "full title fallback"

        if 'soundcloud' in info.get("extractor", ""):
            artist = info['uploader']
            source = "soundcloud \"fallback\""

        artist = artist.strip() if artist else None
        title = title.strip() if title else None

        if title is None and artist is None:
            message.reply_text("Tried tagging, found nothing :(")
            return

        message.reply_text("Tagging as \"{}\" by \"{}\"\nvia {}".format(title, artist, source))
        self.logger.info("Tagging {} w/ {} - {} [{}]...".format(filepath, title, artist, source))
        try:
            id3 = mutagen.id3.ID3(filepath)
        except mutagen.id3.ID3NoHeaderError:
            mutafile = mutagen.File(filepath)
            mutafile.add_tags()
            mutafile.save()
            id3 = mutagen.id3.ID3(filepath)
        id3.add(mutagen.id3.TIT2(encoding=3, text=title))
        if artist:
            id3.add(mutagen.id3.TOPE(encoding=3, text=artist))
            id3.add(mutagen.id3.TPE1(encoding=3, text=artist))
        id3.save()

    def download_ytdl(self, urls, subdir, date, message, audio=False, filename=None):
        """Fetch ``urls`` with youtube-dl into ``tmp_dir`` and move the results to ``out_dir/subdir``.

        :param urls: list of URLs to download.
        :param subdir: directory name below ``out_dir`` receiving the files.
        :param date: datetime used as filename prefix.
        :param message: passed on to ``tag_file`` for chat feedback.
        :param audio: when true, extract the audio track as MP3.
        :param filename: unused here; present so the signature matches ``download_raw``.
        :return: list of the values returned by ``shutil.move`` for each moved file.
        """
        template = '{}__%(title)s__%(id)s.%(ext)s'.format(datestr(date))
        options = {
            'noplaylist': True,
            'restrictfilenames': True,
            'outtmpl': os.path.join(self.tmp_dir, template)
        }
        if audio:
            options['format'] = 'bestaudio/best'
            options['postprocessors'] = [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'mp3',
                'preferredquality': '256'
            }]

        moved = []
        with youtube_dl.YoutubeDL(options) as ydl:
            ydl.download(urls)
            target_dir = os.path.join(self.out_dir, subdir)
            # Metadata for every URL is gathered before any file gets moved.
            infos = [ydl.extract_info(url, download=False) for url in urls]
            for info in infos:
                stem = os.path.splitext(ydl.prepare_filename(info))[0]
                # The final extension may differ from the template's, so match any.
                for path in glob(stem + '.*'):
                    if path.endswith("mp3"):
                        self.tag_file(path, message, info=info)
                    self.logger.info("Moving %s to %s..." % (path, target_dir))
                    moved.append(shutil.move(path, target_dir))
        return moved

    def download_raw(self, urls, subdir, date, message, audio=False, filename=None):
        """Download ``urls`` over HTTP straight into ``out_dir/subdir``.

        Files whose name has no 3-5 character extension get one appended based
        on content sniffing (``filetype.guess``). MP3 files are tagged when
        ``audio`` is true.

        :param urls: list of URLs to fetch.
        :param subdir: directory name below ``out_dir`` receiving the files.
        :param date: datetime used as filename prefix.
        :param message: passed on to ``tag_file`` for chat feedback.
        :param audio: whether MP3 results should be tagged.
        :param filename: name to store the file under; defaults to the last URL path segment.
        :return: list of paths of the stored files.
        :raises requests.RequestException: on connection problems or HTTP error status.
        """
        filenames = []
        for url in urls:
            # basename() keeps a remotely supplied name from pointing outside the target directory.
            name = os.path.basename(filename or url.split('/')[-1])
            local_filename = os.path.join(self.out_dir, subdir, "%s__%s" % (datestr(date), name))
            final_filename = local_filename
            is_mp3 = local_filename.endswith("mp3")

            # Context manager releases the connection; raise_for_status() avoids
            # storing an HTTP error page as if it were the requested file.
            with requests.get(url, stream=True, timeout=30) as r:
                r.raise_for_status()
                with open(local_filename, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=1024):
                        if chunk:
                            f.write(chunk)

            if not re.match(r'.*\..{3,5}$', os.path.split(local_filename)[-1]):
                kind = filetype.guess(local_filename)
                if kind is None:
                    self.logger.error("File has no extension and could not be determined!")
                else:
                    self.logger.info('Moving file without extension... %s?' % kind.extension)
                    final_filename = shutil.move(local_filename, local_filename + '.' + kind.extension)
                    is_mp3 = kind.extension == "mp3"

            filenames.append(final_filename)

            if audio and is_mp3:
                self.tag_file(final_filename, message)

        return filenames

    @staticmethod
    def extract_first_hashtag(message):
        hashtags = list(map(message.parse_entity,
                            list(filter(lambda e: e.type == 'hashtag', message.entities))))
        hashtags += list(map(message.parse_caption_entity,
                             list(filter(lambda e: e.type == 'hashtag', message.caption_entities))))
        if len(hashtags) > 0:
            hashtag = hashtags[0][1:].upper()
            if "PRAS" in hashtag:
                hashtag = "PRAS"
            return hashtag

    def get_hashtag(self, message):
        hashtag = self.extract_first_hashtag(message)
        if hashtag is None:
            if self.last_hashtag is not None and self.last_hashtag[0] == message.from_user:
                hashtag = self.last_hashtag[1]
                self.last_hashtag = None
        return hashtag

    def tg_handle_hashtag(self, bot, update):
        hashtag = self.extract_first_hashtag(update.message)

        if update.message.reply_to_message:
            self.handle_tg_message(update.message.reply_to_message, bot, hashtag)
            self.handle_urls(update.message.reply_to_message, hashtag)
        else:
            self.last_hashtag = (update.message.from_user, hashtag)

    # noinspection PyBroadException
    def handle(self, urls, message, hashtag, download_fn, filename=None):
        try:
            if hashtag is None:
                self.logger.info("Ignoring %s due to no hashtag present..." % urls)
                return

            self.logger.info("Downloading %s under '%s'" % (urls, hashtag))

            reply = 'Downloading'
            if hashtag:
                mkdir_p(os.path.join(self.out_dir, hashtag))
                reply += ' to "' + hashtag + '"'
            reply += '...'

            audio = any([tag in hashtag for tag in ('AUDIO', 'RADIO')])
            if audio and download_fn != self.download_raw:
                reply += ' (And also guessing you want to extract the audio)'

            message.reply_text(reply)

            filenames = download_fn(urls, hashtag or '.', message.date, message, audio=audio, filename=filename)
            if hashtag == 'TUMBLR' and self.client:
                message.reply_text('(btw, queueing to tumblr)')
                for filename in filenames:
                    self.client.create_photo('kunsaxan', state="queue", data=filename)
            elif hashtag == 'TUMBLR_NOW' and self.client:
                message.reply_text('(btw, ***FIRING TO TUMBLR RIGHT AWAY***)',
                                   parse_mode=telegram.ParseMode.MARKDOWN)
                for filename in filenames:
                    self.client.create_photo('kunsaxan', state="published", data=filename)
            self.last_downloaded = filenames
            return filenames
        except:
            _, exc_value, __ = sys.exc_info()
            if "Timed out" not in str(exc_value):
                message.reply_text("Something is FUCKED: %s" % exc_value)

    def tg_handle_url(self, _, update):
        self.handle_urls(update.message, self.get_hashtag(update.message))

    def handle_urls(self, message, hashtag):
        urls = list(map(lambda e: message.parse_entity(e),
                        filter(lambda e: e.type == 'url', message.entities)))
        ytdl_urls = [url for url in urls if self.ytdl_can(url)]
        normal_urls = [url for url in urls if not self.ytdl_can(url)]
        if len(ytdl_urls) > 0:
            self.handle(ytdl_urls, message, hashtag, self.download_ytdl)
        if len(normal_urls) > 0:
            image_urls = [url for url in normal_urls if "image" in requests.head(url).headers.get("Content-Type", "")]
            if len(image_urls) > 0:
                self.handle(image_urls, message, hashtag, self.download_raw)

    # noinspection PyBroadException
    def tg_handle_rest(self, bot, update):
        self.handle_tg_message(update.message, bot, self.get_hashtag(update.message))

    def handle_tg_message(self, message, bot, hashtag):
        file, filename, tumblr = None, None, False
        if len(message.photo) > 0:
            photo = max(message.photo, key=lambda p: p.width)
            file = photo.file_id
        elif message.document is not None:
            filename = message.document.file_name
            file = message.document.file_id
        elif message.audio is not None:
            filename = message.audio.title
            file = message.audio.file_id
        elif message.video is not None:
            file = message.video.file_id
        elif message.video_note is not None:
            file = message.video_note.file_id
        elif message.voice is not None:
            file = message.voice.file_id

        if file is not None:
            url = bot.getFile(file).file_path
            self.handle([url], message, hashtag, self.download_raw, filename=filename)

    def tg_handle_text(self, _, update):
        if self.markov:
            self.markov.add_to_corpus(update.message.text)

    def tag_dirs(self):
        return list(filter(lambda x: x.upper() == x,
                           filter(lambda dir: os.path.isdir(os.path.join(self.out_dir, dir)),
                                  os.listdir(self.out_dir))))

    def tg_stats(self, _, update):
        """Reply with an HTML-formatted overview of all tags.

        Tags holding more than one entry are listed with a breakdown by file
        extension (sub-directories counted separately), biggest first. Tags
        with at most one entry are only named in the trailing orphan list.
        """
        tag_dirs = self.tag_dirs()
        reply = "Total number of tags: {}\n\n".format(len(tag_dirs))
        counts = [(tag, os.listdir(os.path.join(self.out_dir, tag))) for tag in tag_dirs]
        # Biggest tags first; equally sized tags in alphabetical order.
        counts.sort(key=lambda item: (-len(item[1]), item[0]))

        for tag, files in counts:
            if len(files) <= 1:
                # Sorted by size: this tag and all following ones are orphans
                # (one or zero entries) and are reported separately below.
                break
            abs_paths = [os.path.join(self.out_dir, tag, name) for name in files]
            abs_files = [path for path in abs_paths if os.path.isfile(path)]

            ext_counts = {}
            for path in abs_files:
                dotted = os.path.splitext(path)[1]
                if dotted:
                    ext_counts[dotted[1:]] = ext_counts.get(dotted[1:], 0) + 1

            type_counts = list(ext_counts.items())
            dir_cnt = len(abs_paths) - len(abs_files)
            if dir_cnt > 0:
                # "directorie" + the "s" appended below renders as "directories".
                type_counts.append(("directorie", dir_cnt))

            details = ", ".join("{} {}s".format(cnt, kind) for kind, cnt in
                                sorted(type_counts, key=itemgetter(1), reverse=True))
            if len(type_counts) == 1:
                reply += "<b>{}:</b> {}\n".format(tag, details)
            elif type_counts:
                reply += "<b>{}:</b> {} files ({})\n".format(tag, len(files), details)
            else:
                # Only extension-less files: there is no breakdown to show.
                reply += "<b>{}:</b> {} files\n".format(tag, len(files))

        orphans = [tag for tag, files in counts if len(files) <= 1]
        if orphans:
            reply += "\nFollowing tags are orphans: " + ", ".join(orphans)
        update.message.reply_text(reply, parse_mode=telegram.ParseMode.HTML)

    def orphans(self):
        result = []
        tag_dirs = self.tag_dirs()
        for dir in tag_dirs:
            files = os.listdir(os.path.join(self.out_dir, dir))
            if len(files) == 1:
                result.append((dir, files[0]))
            if len(files) == 0:
                result.append((dir, "NO FILE AT ALL..."))
        return sorted(result, key=itemgetter(0))

    def tg_orphan(self, _, update):
        orphans = self.orphans()
        if len(orphans) == 0:
            update.message.reply_text("Good job, no orphan tags!")
        else:
            update.message.reply_text("The following tags only contain a single file:\n" +
                                      ", ".join(map(itemgetter(0), orphans)))

    def tg_orphan_full(self, _, update):
        orphans = self.orphans()
        if len(orphans) == 0:
            update.message.reply_text("Good job, no orphan tags!")
        else:
            tmp_reply = "The following tags only contain a single file:\n"
            for dir, file in orphans:
                line = "{}: {}\n".format(dir, file)
                if len(tmp_reply + line) > 4096:
                    update.message.reply_text(tmp_reply)
                    tmp_reply = ""
                tmp_reply += line
            if len(tmp_reply) > 0:
                update.message.reply_text(tmp_reply)

    def tg_delete(self, _, update):
        if len(self.last_downloaded) > 0:
            for file in self.last_downloaded:
                update.message.reply_text("Removing \"{}\"!".format(file[len(self.out_dir):]))
                os.remove(file)
                file_parent_dir = os.path.dirname(file)
                if len(os.listdir(file_parent_dir)) == 0:
                    hashtag = os.path.split(file_parent_dir)[1].upper()
                    update.message.reply_text("Removing tag \"{}\" as it's empty...".format(hashtag))
                    os.rmdir(file_parent_dir)
            self.last_downloaded.clear()
        else:
            update.message.reply_text("Nothing to remove!")

    def tg_version(self, _, update):
        """Reply with this source file's modification time and the youtube-dl version."""
        source_path = os.path.realpath(__file__)
        modified = datetime.fromtimestamp(os.path.getmtime(source_path))
        delojza_date = modified.strftime('%Y/%m/%d - %H:%M:%S')
        text = "delojza modified date: {}\nyoutube-dl version: {}".format(delojza_date, YTDL_VERSION)
        update.message.reply_text(text)

    def tg_start(self, _, update):
        update.message.reply_text(self.markov.make_sentence() if self.markov else "HELLO")

    def tg_error(self, bot, update, error):
        self.logger.error(error)
        if "Timed out" in str(error):
            if update is not None:
                default = "Mmmm, I like it..."
                update.message.reply_text((self.markov.make_sentence(tries=100) if self.markov else default) or default)
                self.tg_handle_rest(bot, update)
        else:
            if update is not None:
                update.message.reply_text("Something is fucked: %s" % error)

    def run_idle(self):
        """Start polling Telegram for updates, then hand control to ``Updater.idle()``."""
        updater = self.updater
        updater.start_polling()
        self.logger.info("Started Telegram bot...")
        updater.idle()


class MarkovBlabberer:
    """Markov-chain sentence generator backed by a newline-separated corpus file."""

    def __init__(self, filepath):
        """Build the model from the lower-cased contents of ``filepath``.

        :raises OSError: if the corpus file cannot be opened
                         (e.g. ``FileNotFoundError``).
        """
        self.logger = logging.getLogger('markov')
        self.filepath = filepath

        with open(filepath) as f:
            text = f.read()
        self.markov = markovify.NewlineText(text.lower())
        # %s formatting instead of "+": make_sentence() can return None, which
        # would make string concatenation raise TypeError.
        self.logger.info("Sentence of the day: %s", self.make_sentence())

    def make_sentence(self, tries=100):
        """Return a generated sentence; delegates to the model's ``make_sentence`` with ``tries``."""
        return self.markov.make_sentence(tries=tries)

    def add_to_corpus(self, text):
        """Merge the lower-cased ``text`` into the model and append it to the corpus file.

        Empty or whitespace-only text is ignored.
        """
        if not text or not text.strip():
            return
        text = text.lower()
        new_sentence = markovify.NewlineText(text)
        self.markov = markovify.combine([self.markov, new_sentence])
        with open(self.filepath, 'a') as f:
            f.write(text + '\n')


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    _DIR_ = os.path.dirname(os.path.realpath(__file__))
    # Candidate config locations; the first existing file wins.
    CONFIG_PATHS = ['/etc/delojza/delojza.ini',
                    os.path.join(os.getenv("HOME") or "", ".config/delojza/delojza.ini"),
                    os.path.join(_DIR_, "delojza.ini")]

    config = ConfigParser()
    try:
        CONF_FILE = next(conf_path for conf_path in CONFIG_PATHS if os.path.isfile(conf_path))
    except StopIteration:
        logging.error("No config file found, stopping.")
        sys.exit(-1)
    config.read(CONF_FILE)

    # The Markov corpus is optional; without it the bot runs without blabbering.
    try:
        markov = MarkovBlabberer("initial.txt")
    except FileNotFoundError:
        markov = None

    # Tumblr is optional: only enable it when all four credentials are present.
    # ``fallback`` also covers a missing [tumblr] section.
    tumblr_keys = tuple(config.get('tumblr', option, fallback=None)
                        for option in ('consumer_key', 'consumer_secret', 'oauth_key', 'oauth_secret'))
    if not all(tumblr_keys):
        tumblr_keys = None

    delojza = DelojzaBot(config.get('delojza', 'tg_api_key'),
                         config.get('delojza', 'OUT_DIR', fallback=os.path.join(_DIR_, "out")),
                         tmp_dir=config.get('delojza', 'tmp_dir', fallback="/var/tmp"),
                         # AcoustID is optional as well; tagging falls back to other sources.
                         acoustid_key=config.get('delojza', 'acoustid_api_key', fallback=None),
                         tumblr_keys=tumblr_keys,
                         markov=markov)
    delojza.run_idle()