#!/usr/bin/env python3
import errno
import logging
import os
import re
import shutil
import sys
from configparser import ConfigParser
from datetime import datetime
from glob import glob
from operator import itemgetter

import filetype
import markovify
import mutagen.id3
import pytumblr
import requests
import telegram
import youtube_dl
from telegram import MessageEntity
from telegram.ext import Updater, CommandHandler, MessageHandler, Filters
from youtube_dl.version import __version__ as YTDL_VERSION


def mkdir_p(path):
    try:
        os.makedirs(path)
    except OSError as exc:
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise


def datestr(date):
    return date.strftime("%Y-%m-%d@%H%M")


class DelojzaBot:
    def __init__(self, tg_api_key, out_dir, tmp_dir='/var/tmp', tumblr_keys=None, markov=None):
        self.logger = logging.getLogger("delojza")

        self.out_dir = out_dir
        self.logger.debug('OUT_DIR: ' + out_dir)
        self.tmp_dir = tmp_dir
        self.logger.debug('TMP_DIR: ' + tmp_dir)

        self.markov = markov

        self.updater = Updater(tg_api_key)
        dp = self.updater.dispatcher
        dp.add_handler(CommandHandler("start", self.tg_start))
        dp.add_error_handler(self.tg_error)
        self.tg_url_handler = MessageHandler(Filters.entity(MessageEntity.URL), self.tg_handle_url)
        dp.add_handler(self.tg_url_handler)
        self.tg_rest_handler = MessageHandler(Filters.photo | Filters.video | Filters.video_note |
                                              Filters.audio | Filters.voice | Filters.document,
                                              self.tg_handle_rest)
        dp.add_handler(self.tg_rest_handler)
        dp.add_handler(MessageHandler(Filters.entity(MessageEntity.HASHTAG), self.tg_handle_hashtag))
        dp.add_handler(MessageHandler(Filters.text, self.tg_handle_text))
        dp.add_handler(CommandHandler("stats", self.tg_stats))
        dp.add_handler(CommandHandler("orphans", self.tg_orphan))
        dp.add_handler(CommandHandler("orphans_full", self.tg_orphan_full))
        dp.add_handler(CommandHandler("version", self.tg_version))

        # Tumblr is optional; keep the attribute defined either way so later checks don't blow up.
        self.client = pytumblr.TumblrRestClient(*tumblr_keys) if tumblr_keys else None

        self.last_hashtag = None

    @staticmethod
    def ytdl_can(url):
        ies = youtube_dl.extractor.gen_extractors()
        for ie in ies:
            if ie.suitable(url) and ie.IE_NAME != 'generic' \
                    and '/channel/' not in url:
                # Site has dedicated extractor
                return True
        return False

    @staticmethod
    def extract_tags(info):
        title = None
        artist = None
        if 'track' in info:
            title = info['track']
        if 'artist' in info:
            artist = info['artist']
        if title is None and artist is None and '-' in info['title']:
            split = info['title'].split("-")
            artist = split[0]
            title = split[1]
        if title is None:
            title = info['title']
        if 'soundcloud' in info['extractor']:
            artist = info['uploader']
        return artist.strip() if artist is not None else None, title.strip() if title is not None else None

    def download_ytdl(self, urls, subdir, date, message, extract=False, filename=None):
        ydl_opts = {
            'noplaylist': True,
            'restrictfilenames': True,
            'outtmpl': os.path.join(self.tmp_dir, '{}__%(title)s__%(id)s.%(ext)s'.format(datestr(date)))
        }
        if extract:
            ydl_opts['format'] = 'bestaudio/best'
            ydl_opts['postprocessors'] = [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'mp3',
                'preferredquality': '256'
            }]
        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            ydl.download(urls)
            out_dir = os.path.join(self.out_dir, subdir)
            for info in [ydl.extract_info(url, download=False) for url in urls]:
                filename = ydl.prepare_filename(info)
                globbeds = glob(os.path.splitext(filename)[0] + '.*')
                for globbed in globbeds:
                    if globbed.endswith("mp3"):
                        artist, title = self.extract_tags(info)
                        message.reply_text("Tagging as \"{}\" by \"{}\"".format(title, artist))
                        self.logger.info("Tagging %s w/ %s - %s...", globbed, title, artist)
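                        # Write ID3 title/artist frames; files fresh out of the mp3
                        # postprocessor may lack an ID3 header, so create one on demand.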
                        try:
                            id3 = mutagen.id3.ID3(globbed)
                        except mutagen.id3.ID3NoHeaderError:
                            mutafile = mutagen.File(globbed)
                            mutafile.add_tags()
                            mutafile.save()
                            id3 = mutagen.id3.ID3(globbed)
                        id3.add(mutagen.id3.TIT2(encoding=3, text=title))
                        if artist:
                            id3.add(mutagen.id3.TOPE(encoding=3, text=artist))
                            id3.add(mutagen.id3.TPE1(encoding=3, text=artist))
                        id3.save()
                    self.logger.info("Moving %s to %s..." % (globbed, out_dir))
                    shutil.move(globbed, out_dir)
        return []

    def download_raw(self, urls, subdir, date, _, extract=False, filename=None):
        filenames = []
        for url in urls:
            local_filename = os.path.join(self.out_dir, subdir,
                                          "%s__%s" % (datestr(date), filename or url.split('/')[-1]))
            r = requests.get(url, stream=True)
            with open(local_filename, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)
            if not re.match(r'.*\..{3,5}$', os.path.split(local_filename)[-1]):
                kind = filetype.guess(local_filename)
                if kind is None:
                    self.logger.error("File has no extension and its type could not be determined!")
                else:
                    self.logger.info('Moving file without extension... %s?' % kind.extension)
                    shutil.move(local_filename, local_filename + '.' + kind.extension)
                    # Keep track of the path the file actually ended up at.
                    local_filename += '.' + kind.extension
            filenames.append(local_filename)
        return filenames

    @staticmethod
    def extract_first_hashtag(message):
        hashtags = list(map(message.parse_entity,
                            list(filter(lambda e: e.type == 'hashtag', message.entities))))
        hashtags += list(map(message.parse_caption_entity,
                             list(filter(lambda e: e.type == 'hashtag', message.caption_entities))))
        if len(hashtags) > 0:
            hashtag = hashtags[0][1:].upper()
            if "PRAS" in hashtag:
                hashtag = "PRAS"
            return hashtag
        return None

    def get_hashtag(self, message):
        hashtag = self.extract_first_hashtag(message)
        if hashtag is None:
            if self.last_hashtag is not None and self.last_hashtag[0] == message.from_user:
                hashtag = self.last_hashtag[1]
            self.last_hashtag = None
        return hashtag

    def tg_handle_hashtag(self, bot, update):
        hashtag = self.extract_first_hashtag(update.message)

        if update.message.reply_to_message:
            self.handle_tg_message(update.message.reply_to_message, bot, hashtag)
            self.handle_urls(update.message.reply_to_message, hashtag)
        else:
            self.last_hashtag = (update.message.from_user, hashtag)

    # noinspection PyBroadException
    def handle(self, urls, message, hashtag, download_fn, filename=None):
        try:
            if hashtag is None:
                self.logger.info("Ignoring %s due to no hashtag present..." % urls)
                return

            self.logger.info("Downloading %s under '%s'" % (urls, hashtag))

            reply = 'Downloading'
            if hashtag:
                mkdir_p(os.path.join(self.out_dir, hashtag))
                reply += ' to "' + hashtag + '"'
            reply += '...'
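            # A hashtag containing AUDIO or RADIO is treated as "extract the audio track";
            # this only applies to youtube-dl downloads, raw files are kept as-is.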
            extract = False
            if any([tag in hashtag for tag in ('AUDIO', 'RADIO')]) and download_fn != self.download_raw:
                extract = True
                reply += ' (And also guessing you want to extract the audio)'

            message.reply_text(reply)

            filenames = download_fn(urls, hashtag or '.', message.date, message,
                                    extract=extract, filename=filename)

            if hashtag == 'TUMBLR' and self.client:
                message.reply_text('(btw, queueing to tumblr)')
                for filename in filenames:
                    self.client.create_photo('kunsaxan', state="queue", data=filename)
            elif hashtag == 'TUMBLR_NOW' and self.client:
                message.reply_text('(btw, ***FIRING TO TUMBLR RIGHT AWAY***)',
                                   parse_mode=telegram.ParseMode.MARKDOWN)
                for filename in filenames:
                    self.client.create_photo('kunsaxan', state="published", data=filename)

            return filenames
        except:
            _, exc_value, __ = sys.exc_info()
            if "Timed out" not in str(exc_value):
                message.reply_text("Something is FUCKED: %s" % exc_value)

    def tg_handle_url(self, _, update):
        self.handle_urls(update.message, self.get_hashtag(update.message))

    def handle_urls(self, message, hashtag):
        urls = list(map(lambda e: message.parse_entity(e),
                        filter(lambda e: e.type == 'url', message.entities)))
        ytdl_urls = [url for url in urls if self.ytdl_can(url)]
        normal_urls = [url for url in urls if not self.ytdl_can(url)]
        if len(ytdl_urls) > 0:
            self.handle(ytdl_urls, message, hashtag, self.download_ytdl)
        if len(normal_urls) > 0:
            image_urls = [url for url in normal_urls
                          if "image" in requests.head(url).headers.get("Content-Type", "")]
            if len(image_urls) > 0:
                self.handle(image_urls, message, hashtag, self.download_raw)

    # noinspection PyBroadException
    def tg_handle_rest(self, bot, update):
        self.handle_tg_message(update.message, bot, self.get_hashtag(update.message))

    def handle_tg_message(self, message, bot, hashtag):
        file, filename, tumblr = None, None, False
        if len(message.photo) > 0:
            photo = max(message.photo, key=lambda p: p.width)
            file = photo.file_id
        elif message.document is not None:
            filename = message.document.file_name
            file = message.document.file_id
        elif message.audio is not None:
            filename = message.audio.title
            file = message.audio.file_id
        elif message.video is not None:
            file = message.video.file_id
        elif message.video_note is not None:
            file = message.video_note.file_id
        elif message.voice is not None:
            file = message.voice.file_id

        if file is not None:
            url = bot.getFile(file).file_path
            self.handle([url], message, hashtag, self.download_raw, filename=filename)

    def tg_handle_text(self, _, update):
        if self.markov:
            self.markov.add_to_corpus(update.message.text)

    def tag_dirs(self):
        return list(filter(lambda x: x.upper() == x,
                           filter(lambda dir: os.path.isdir(os.path.join(self.out_dir, dir)),
                                  os.listdir(self.out_dir))))

    def tg_stats(self, _, update):
        tag_dirs = self.tag_dirs()
        reply = "Total number of tags: {}\n\n".format(len(tag_dirs))

        counts = [(dir, os.listdir(os.path.join(self.out_dir, dir))) for dir in tag_dirs]
        counts.sort(key=itemgetter(0))
        counts.sort(key=lambda x: len(x[1]), reverse=True)
        for dir, files in counts:
            # counts is sorted by size, so the first single-file tag means only
            # orphans are left; those are summarized separately below.
            if len(files) == 1:
                break
            abs_paths = [os.path.join(self.out_dir, dir, file) for file in files]
            abs_files = list(filter(os.path.isfile, abs_paths))
            # mimes = [magic.from_file(path, mime=True).split("/")[0] for path in abs_files]
            # mime_counts = [(mime, mimes.count(mime)) for mime in set(mimes)]
            exts = [ext[1:] for ext in [os.path.splitext(path)[1] for path in abs_files] if len(ext) > 0]
            ext_counts = [(ext, exts.count(ext)) for ext in set(exts)]
            dir_cnt = len(abs_paths) - len(abs_files)
            type_counts = ext_counts + ([("directorie", dir_cnt)] if dir_cnt > 0 else [])
            details = ", ".join(["{} {}s".format(cnt, mime) for mime, cnt
                                 in sorted(type_counts, key=itemgetter(1), reverse=True)])
            if len(type_counts) == 1:
                reply += "{}: {}\n".format(dir, details)
            else:
                reply += "{}: {} files ({})\n".format(dir, len(files), details)

        orphans = list(filter(lambda cnt: len(cnt[1]) <= 1, counts))
        if len(orphans) > 0:
            reply += "\nThe following tags are orphans: " + ", ".join(map(itemgetter(0), orphans))

        update.message.reply_text(reply, parse_mode=telegram.ParseMode.HTML)

    def orphans(self):
        result = []
        tag_dirs = self.tag_dirs()
        for dir in tag_dirs:
            files = os.listdir(os.path.join(self.out_dir, dir))
            if len(files) == 1:
                result.append((dir, files[0]))
            if len(files) == 0:
                result.append((dir, "NO FILE AT ALL..."))
        return sorted(result, key=itemgetter(0))

    def tg_orphan(self, _, update):
        orphans = self.orphans()
        if len(orphans) == 0:
            update.message.reply_text("Good job, no orphan tags!")
        else:
            update.message.reply_text("The following tags only contain a single file:\n" +
                                      ", ".join(map(itemgetter(0), orphans)))

    def tg_orphan_full(self, _, update):
        orphans = self.orphans()
        if len(orphans) == 0:
            update.message.reply_text("Good job, no orphan tags!")
        else:
            tmp_reply = "The following tags only contain a single file:\n"
            for dir, file in orphans:
                line = "{}: {}\n".format(dir, file)
                # Telegram messages are capped at 4096 characters, so flush early.
                if len(tmp_reply + line) > 4096:
                    update.message.reply_text(tmp_reply)
                    tmp_reply = ""
                tmp_reply += line
            if len(tmp_reply) > 0:
                update.message.reply_text(tmp_reply)

    def tg_version(self, _, update):
        delojza_date = datetime.fromtimestamp(os.path.getmtime(os.path.realpath(__file__))) \
            .strftime('%Y/%m/%d - %H:%M:%S')
        update.message.reply_text("delojza modified date: {}\nyoutube-dl version: {}"
                                  .format(delojza_date, YTDL_VERSION))

    def tg_start(self, _, update):
        update.message.reply_text(self.markov.make_sentence() if self.markov else "HELLO")

    def tg_error(self, bot, update, error):
        self.logger.error(error)
        if "Timed out" in str(error):
            if update is not None:
                default = "Mmmm, I like it..."
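                # On a Telegram timeout, reply with a markov sentence (or the canned
                # default) and give the media handler one more try.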
                update.message.reply_text((self.markov.make_sentence(tries=100)
                                           if self.markov else default) or default)
                self.tg_handle_rest(bot, update)
        else:
            if update is not None:
                update.message.reply_text("Something is fucked: %s" % error)

    def run_idle(self):
        self.updater.start_polling()
        self.logger.info("Started Telegram bot...")
        self.updater.idle()


class MarkovBlabberer:
    def __init__(self, filepath):
        self.logger = logging.getLogger('markov')
        self.filepath = filepath
        with open(filepath) as f:
            text = f.read()
            self.markov = markovify.NewlineText(text.lower())
            self.logger.info("Sentence of the day: " + self.make_sentence())

    def make_sentence(self, tries=100):
        return self.markov.make_sentence(tries=tries)

    def add_to_corpus(self, text):
        text = text.lower()
        new_sentence = markovify.NewlineText(text)
        self.markov = markovify.combine([self.markov, new_sentence])
        with open(self.filepath, 'a') as f:
            f.write(text + '\n')


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    _DIR_ = os.path.dirname(os.path.realpath(__file__))
    CONFIG_PATHS = ['/etc/delojza/delojza.ini',
                    os.path.join(os.getenv("HOME") or "", ".config/delojza/delojza.ini"),
                    os.path.join(_DIR_, "delojza.ini")]

    config = ConfigParser()
    try:
        CONF_FILE = next(conf_path for conf_path in CONFIG_PATHS if os.path.isfile(conf_path))
        config.read(CONF_FILE)
    except StopIteration:
        logging.error("No config file found, stopping.")
        sys.exit(-1)

    markov = MarkovBlabberer("initial.txt")

    delojza = DelojzaBot(config.get('delojza', 'tg_api_key'),
                         config.get('delojza', 'OUT_DIR', fallback=os.path.join(_DIR_, "out")),
                         tmp_dir=config.get('delojza', 'tmp_dir', fallback="/var/tmp"),
                         tumblr_keys=(config.get('tumblr', 'consumer_key'),
                                      config.get('tumblr', 'consumer_secret'),
                                      config.get('tumblr', 'oauth_key'),
                                      config.get('tumblr', 'oauth_secret')),
                         markov=markov)
    delojza.run_idle()
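# A minimal sketch of the delojza.ini layout this script looks for (section and option
# names are taken from the config.get() calls above; every value below is a placeholder):
#
#   [delojza]
#   tg_api_key = <telegram bot token>
#   out_dir = /path/to/archive
#   tmp_dir = /var/tmp
#
#   [tumblr]
#   consumer_key = <key>
#   consumer_secret = <secret>
#   oauth_key = <key>
#   oauth_secret = <secret>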