#!/usr/bin/env python3
"""Telegram bot that archives hashtagged media and babbles Markov sentences.

Messages carrying a URL or a media attachment are downloaded into
``out/<HASHTAG>/`` (next to this script) when a hashtag accompanies them,
either in the same message or in the sender's immediately preceding
hashtag-only message.  Plain text messages are appended to the Markov corpus
stored in ``initial.txt``.
"""
import errno
import logging
import os
import re
import shutil
import sys
from glob import glob

import filetype
import markovify
import requests
import youtube_dl
from telegram import MessageEntity
from telegram.ext import Updater, CommandHandler, MessageHandler, Filters

DIR = os.path.dirname(os.path.realpath(__file__))
TMP_DIR = '/var/tmp'
OUT_DIR = DIR + '/out'

# Reply used whenever the Markov model fails to produce a sentence.
FALLBACK_SENTENCE = "Mmmm, I like it..."

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger("kunsax")

# Global Markov model; loaded in main() and extended by add_to_corpus().
markov = None

# (user, hashtag) of the most recent hashtag-only message, or None.
last_hashtag = None


def _make_sentence(**kwargs):
    """Return a generated sentence, or FALLBACK_SENTENCE if none was produced."""
    sentence = markov.make_sentence(**kwargs) if markov is not None else None
    return sentence or FALLBACK_SENTENCE


def add_to_corpus(text):
    """Fold *text* (lower-cased) into the global model and persist it.

    The lower-cased text is also appended, followed by a newline, to
    ``initial.txt`` in the current working directory.
    """
    global markov
    text = text.lower()
    addition = markovify.NewlineText(text)
    # combine() cannot merge with a missing model, so start from the new one
    # when no corpus has been loaded yet.
    markov = addition if markov is None else markovify.combine([markov, addition])
    with open("initial.txt", 'a') as f:
        f.write(text + '\n')


def datestr(date):
    """Format *date* as ``YYYY-MM-DD@HHMM`` for use in file names."""
    return date.strftime("%Y-%m-%d@%H%M")


def mkdir_p(path):
    """Create *path* and any missing parents; an existing directory is fine.

    Raises:
        OSError: if creation fails, including when *path* exists but is not
            a directory.
    """
    os.makedirs(path, exist_ok=True)


def ytdl_has(url):
    """Return True if youtube-dl has a non-generic extractor suitable for *url*.

    URLs containing ``/channel/`` are always rejected.
    """
    if '/channel/' in url:
        return False
    return any(
        ie.suitable(url) and ie.IE_NAME != 'generic'
        for ie in youtube_dl.extractor.gen_extractors())


def download_ydl(urls, subdir, date, extract=False, filename=None):
    """Download *urls* with youtube-dl and move the results to OUT_DIR/subdir.

    Args:
        urls: list of URLs to fetch.
        subdir: directory name below OUT_DIR that receives the files.
        date: datetime used as the file-name prefix.
        extract: when true, request the ``bestaudio`` format.
        filename: unused; accepted so the signature matches download_raw().
    """
    ydl_opts = {
        'noplaylist': True,
        'restrictfilenames': True,
        'outtmpl': TMP_DIR + '/' + datestr(date) + '__%(title)s__%(id)s.%(ext)s',
    }
    if extract:
        ydl_opts['format'] = 'bestaudio'

    out_dir = OUT_DIR + '/' + subdir + '/'
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        ydl.download(urls)
        for url in urls:
            # Re-derive the path youtube-dl used for this URL, then pick up
            # every file sharing that stem regardless of extension.
            tmp_path = ydl.prepare_filename(ydl.extract_info(url))
            for produced in glob(os.path.splitext(tmp_path)[0] + '.*'):
                logger.info("Moving %s to %s...", produced, out_dir)
                shutil.move(produced, out_dir)


def _safe_name(name):
    """Replace path separators in *name* so it stays a single path component."""
    for sep in (os.sep, os.altsep):
        if sep:
            name = name.replace(sep, '_')
    return name


def download_raw(urls, subdir, date, extract=False, filename=None):
    """Stream each URL in *urls* straight to a file in OUT_DIR/subdir.

    Files are named ``<datestr>__<filename or last URL segment>``.  When the
    resulting name has no 3-5 character extension, the content type is
    guessed and the matching extension appended.

    Args:
        urls: list of direct file URLs.
        subdir: directory name below OUT_DIR that receives the files.
        date: datetime used as the file-name prefix.
        extract: unused; accepted so the signature matches download_ydl().
        filename: optional name overriding the last URL path segment.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    for url in urls:
        # The name may come from user-supplied metadata (document name, audio
        # title), so separators are flattened before it is used in a path.
        leaf = "%s__%s" % (datestr(date), _safe_name(filename or url.split('/')[-1]))
        local_filename = OUT_DIR + '/' + subdir + '/' + leaf

        r = requests.get(url, stream=True)
        try:
            # Fail before creating the file rather than saving an error body.
            r.raise_for_status()
            with open(local_filename, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)
        finally:
            r.close()

        if re.match(r'.*\..{3,5}$', os.path.split(local_filename)[-1]):
            continue
        kind = filetype.guess(local_filename)
        if kind is None:
            logger.error("File has no extension and could not be determined!")
        else:
            logger.info('Moving file without extension... %s?', kind.extension)
            shutil.move(local_filename, local_filename + '.' + kind.extension)


def _body_hashtags(message):
    """Return the text of every hashtag entity in the message body."""
    return [message.parse_entity(e)
            for e in message.entities if e.type == 'hashtag']


def get_first_hashtag(message):
    """Return the upper-cased first hashtag for *message*, without the ``#``.

    Body hashtags are considered before caption hashtags.  If the message has
    none, the hashtag remembered by handle_hashtag() is used (and forgotten)
    provided it came from the same user; otherwise None is returned.  Any tag
    containing ``PRAS`` is normalised to ``PRAS``.
    """
    global last_hashtag
    hashtags = _body_hashtags(message)
    hashtags += [message.parse_caption_entity(e)
                 for e in message.caption_entities if e.type == 'hashtag']

    if hashtags:
        raw_tag = hashtags[0]
    elif last_hashtag is not None and last_hashtag[0] == message.from_user:
        raw_tag = last_hashtag[1]
        last_hashtag = None  # a remembered hashtag is consumed once
    else:
        return None

    hashtag = raw_tag[1:].upper()
    if "PRAS" in hashtag:
        hashtag = "PRAS"
    return hashtag


def handle_hashtag(bot, update):
    """Remember the sender and first hashtag of a hashtag message."""
    global last_hashtag
    hashtags = _body_hashtags(update.message)
    if hashtags:
        last_hashtag = (update.message.from_user, hashtags[0])


def handle(urls, message, download, filename=None):
    """Download *urls* into the hashtag's directory and report progress.

    Args:
        urls: list of URLs to fetch.
        message: the Telegram message that triggered the download; replies
            are sent to it.
        download: download_ydl or download_raw.
        filename: optional file name forwarded to *download*.

    Nothing is downloaded when no hashtag applies to the message.  Failures
    are logged and reported back to the chat, except those whose text
    contains "Timed out", which are only logged.
    """
    try:
        hashtag = get_first_hashtag(message)
        if hashtag is None:
            logger.info("Ignoring %s due to no hashtag present...", urls)
            return

        logger.info("Downloading %s", urls)
        reply = 'Downloading'
        if hashtag:
            mkdir_p(OUT_DIR + '/' + hashtag)
            reply += ' to "' + hashtag + '"'
        reply += '...'
        wants_audio = hashtag == 'AUDIO'
        if wants_audio and download != download_raw:
            reply += ' (And also guessing you want to extract the audio)'
        message.reply_text(reply)
        download(urls, hashtag or '.', message.date,
                 extract=wants_audio, filename=filename)
    except Exception as exc:
        # Top-level boundary for a single message: keep the bot alive, but
        # let KeyboardInterrupt/SystemExit propagate and keep the traceback.
        logger.exception("Download of %s failed", urls)
        if "Timed out" not in str(exc):
            message.reply_text("Something is FUCKED: %s" % exc)


def handle_url(bot, update):
    """Download every URL in the message that youtube-dl has an extractor for."""
    message = update.message
    candidates = (message.parse_entity(e)
                  for e in message.entities if e.type == 'url')
    ytdl_urls = [url for url in candidates if ytdl_has(url)]
    if ytdl_urls:
        handle(ytdl_urls, message, download_ydl)


def handle_rest(bot, update):
    """Download the attachment (photo, document, audio, video, ...) of a message.

    For photos the widest available size is chosen.  Documents keep their
    file name and audio files use their title as the name; other kinds are
    named after the last segment of the file URL.
    """
    message = update.message
    file_id, filename = None, None
    if message.photo:
        file_id = max(message.photo, key=lambda p: p.width).file_id
    elif message.document is not None:
        filename = message.document.file_name
        file_id = message.document.file_id
    elif message.audio is not None:
        filename = message.audio.title
        file_id = message.audio.file_id
    elif message.video is not None:
        file_id = message.video.file_id
    elif message.video_note is not None:
        file_id = message.video_note.file_id
    elif message.voice is not None:
        file_id = message.voice.file_id

    if file_id is not None:
        url = bot.getFile(file_id).file_path
        handle([url], message, download_raw, filename=filename)


def handle_text(bot, update):
    """Add the text of a message to the Markov corpus."""
    add_to_corpus(update.message.text)


def start(bot, update):
    """Reply to /start with a generated sentence."""
    update.message.reply_text(_make_sentence())


def error(bot, update, error):
    """Log a dispatcher error and tell the chat about it.

    On errors whose text contains "Timed out" a generated sentence is sent
    and the attachment download is attempted again; any other error is
    echoed back to the chat.  Updates without a message are only logged.
    """
    logger.error(error)
    if update is None or update.message is None:
        # Nothing to reply to.
        return
    if "Timed out" in str(error):
        update.message.reply_text(_make_sentence(tries=100))
        handle_rest(bot, update)
    else:
        update.message.reply_text("Something is fucked: %s" % error)


def main():
    """Load the corpus from ``initial.txt``, register handlers and poll."""
    global markov
    with open("initial.txt") as f:
        text = f.read()
    markov = markovify.NewlineText(text.lower())
    # Lazy formatting: a missing sentence is logged as "None" instead of
    # raising TypeError on string concatenation.
    logger.info("Sentence of the day: %s", markov.make_sentence())

    # NOTE(review): the token literal looks like a redaction placeholder;
    # confirm how the real bot token is meant to be supplied.
    updater = Updater("***REMOVED***")
    dp = updater.dispatcher

    dp.add_handler(CommandHandler("start", start))
    dp.add_error_handler(error)
    dp.add_handler(MessageHandler(Filters.entity(MessageEntity.URL), handle_url))
    dp.add_handler(
        MessageHandler(
            Filters.photo | Filters.video | Filters.video_note
            | Filters.audio | Filters.voice | Filters.document,
            handle_rest))
    dp.add_handler(MessageHandler(Filters.entity(MessageEntity.HASHTAG), handle_hashtag))
    dp.add_handler(MessageHandler(Filters.text, handle_text))

    updater.start_polling()
    logger.info("Started Telegram bot...")
    updater.idle()


if __name__ == '__main__':
    main()