delojza/delojza.py

#!/usr/bin/env python3
import errno
import logging
import os
import re
import shutil
import sys
from glob import glob

import filetype
import markovify
import requests
import youtube_dl
from telegram import MessageEntity
from telegram.ext import Updater, CommandHandler, MessageHandler, Filters

# Directory containing this script (with symlinks resolved by realpath).
DIR = os.path.dirname(os.path.realpath(__file__))
# Staging directory: youtube-dl writes here before files are moved to OUT_DIR.
TMP_DIR = '/var/tmp'
# Root of the saved files; downloads land in per-hashtag sub-directories.
OUT_DIR = DIR + '/out'

# Log INFO and above with timestamp, logger name and level.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger("kunsax")

# Shared markovify model; assigned in main() and extended by add_to_corpus().
markov = None


def add_to_corpus(text):
    """Fold *text* into the in-memory Markov model and persist it.

    The text is lower-cased, parsed with ``markovify.NewlineText`` and merged
    into the module-level ``markov`` model. The same lower-cased text is then
    appended as one line to ``initial.txt`` (resolved against the current
    working directory), the file ``main()`` reads at start-up.

    :param text: raw message text to learn from.
    """
    global markov
    text = text.lower()
    new_sentence = markovify.NewlineText(text)
    # ``markov`` is None until main() has loaded the corpus; there is nothing
    # to combine with in that case, so the new model becomes the corpus.
    if markov is None:
        markov = new_sentence
    else:
        markov = markovify.combine([markov, new_sentence])
    with open("initial.txt", 'a') as f:
        f.write(text + '\n')


def datestr(date):
    """Render *date* as ``YYYY-MM-DD@HHMM`` for use as a file-name prefix."""
    day = date.strftime("%Y-%m-%d")
    clock = date.strftime("%H%M")
    return day + "@" + clock


def mkdir_p(path):
    """Create directory *path* and any missing parents, like ``mkdir -p``.

    An already existing directory is not an error.

    :param path: directory path to create.
    :raises OSError: if creation fails, including when *path* already exists
        but is not a directory.
    """
    # exist_ok=True gives the "ignore EEXIST if it is a directory" behaviour
    # without a hand-written errno check.
    os.makedirs(path, exist_ok=True)


def ytdl_has(url):
    """Return True if youtube-dl has a dedicated extractor for *url*.

    URLs containing ``/channel/`` are always rejected, and the catch-all
    ``generic`` extractor does not count as a match.

    :param url: URL string to test.
    :returns: True when a non-generic extractor reports the URL as suitable.
    """
    # The channel test does not depend on the extractor, so decide it once
    # up front instead of building and walking the whole extractor list.
    if '/channel/' in url:
        return False
    return any(ie.suitable(url) and ie.IE_NAME != 'generic'
               for ie in youtube_dl.extractor.gen_extractors())


def download_ydl(urls, subdir, date, extract=False, filename=None):
    """Download *urls* with youtube-dl and move the results to OUT_DIR/subdir.

    Files are first written to TMP_DIR as
    ``<datestr(date)>__<title>__<id>.<ext>`` and then moved into the
    destination directory.

    :param urls: iterable of URLs to download.
    :param subdir: sub-directory of OUT_DIR that receives the files.
    :param date: datetime used for the file-name prefix.
    :param extract: when true, request only the ``bestaudio`` format.
    :param filename: unused here; accepted so the signature matches
        ``download_raw``.
    """
    ydl_opts = {
        'noplaylist': True,
        'restrictfilenames': True,
        'outtmpl': TMP_DIR + '/' + datestr(date) + '__%(title)s__%(id)s.%(ext)s'
    }
    if extract:
        # The audio stream is kept in its original format; no post-processor
        # converts it.
        ydl_opts['format'] = 'bestaudio'

    out_dir = OUT_DIR + '/' + subdir + '/'
    # Make sure the destination exists so shutil.move() moves *into* it.
    os.makedirs(out_dir, exist_ok=True)

    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        # extract_info() downloads by default, so one call per URL both
        # fetches the media and returns the metadata needed to rebuild the
        # file name. Calling download() first processed every URL twice.
        infos = [ydl.extract_info(url) for url in urls]
        for info in infos:
            expected_path = ydl.prepare_filename(info)
            # Match on the stem with any extension; presumably the file on
            # disk can end up with a different extension than predicted
            # (e.g. after merging) — TODO confirm.
            for globbed in glob(os.path.splitext(expected_path)[0] + '.*'):
                logger.info("Moving %s to %s...", globbed, out_dir)
                shutil.move(globbed, out_dir)


def download_raw(urls, subdir, date, extract=False, filename=None):
    """Stream each URL in *urls* to ``OUT_DIR/subdir/<datestr>__<name>``.

    ``<name>`` is *filename* when given, otherwise the last path segment of
    the URL. If the stored name does not end in a dot followed by 3-5
    characters, the file type is guessed from its content and the guessed
    extension is appended.

    :param urls: iterable of direct file URLs.
    :param subdir: sub-directory of OUT_DIR to write into (must exist).
    :param date: datetime used for the file-name prefix.
    :param extract: unused here; accepted so the signature matches
        ``download_ydl``.
    :param filename: optional name to store the file under.
    :raises requests.HTTPError: if the server answers with an error status.
    """
    for url in urls:
        name = filename or url.split('/')[-1]
        # *filename* is supplied by the message sender; flatten path
        # separators so it can neither escape nor break the target directory.
        name = name.replace('/', '_').replace(os.sep, '_')
        local_filename = OUT_DIR + '/' + subdir + '/' + "%s__%s" % (datestr(date), name)
        # The context manager releases the streamed connection when done.
        with requests.get(url, stream=True) as r:
            # Fail loudly instead of saving an HTTP error page as the file.
            r.raise_for_status()
            with open(local_filename, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)
        if not re.match(r'.*\..{3,5}$', os.path.split(local_filename)[-1]):
            kind = filetype.guess(local_filename)
            if kind is None:
                logger.error("File has no extension and could not be determined!")
            else:
                logger.info('Moving file without extension... %s?' % kind.extension)
                shutil.move(local_filename, local_filename + '.' + kind.extension)


# (from_user, hashtag text) recorded by handle_hashtag(). get_first_hashtag()
# falls back to it for a message from the same user that carries no hashtag
# of its own, and clears it once used.
last_hashtag = None


def get_first_hashtag(message):
    """Return the normalised hashtag that applies to *message*, or None.

    Hashtags in the message text are considered first, then those in the
    caption. If there are none, the hashtag remembered in ``last_hashtag`` is
    used (and cleared) when it was recorded for the same sender. The leading
    ``#`` is stripped and the tag upper-cased; any tag containing ``PRAS``
    collapses to ``PRAS``.
    """
    global last_hashtag
    found = [message.parse_entity(entity)
             for entity in message.entities
             if entity.type == 'hashtag']
    found.extend(message.parse_caption_entity(entity)
                 for entity in message.caption_entities
                 if entity.type == 'hashtag')

    if found:
        raw_tag = found[0]
    elif last_hashtag is not None and last_hashtag[0] == message.from_user:
        # Consume the remembered hashtag so it applies to one message only.
        raw_tag = last_hashtag[1]
        last_hashtag = None
    else:
        return None

    tag = raw_tag[1:].upper()
    return "PRAS" if "PRAS" in tag else tag


def handle_hashtag(bot, update):
    """Remember the first hashtag of a message together with its sender.

    The pair is stored in the module-level ``last_hashtag``; messages without
    a hashtag entity leave it untouched.
    """
    global last_hashtag
    message = update.message
    tags = [message.parse_entity(entity)
            for entity in message.entities
            if entity.type == 'hashtag']
    if not tags:
        return
    last_hashtag = (message.from_user, tags[0])


# noinspection PyBroadException
def handle(urls, message, download, filename=None):
    """Download *urls* into the folder named after the message's hashtag.

    Messages without an applicable hashtag are ignored. Otherwise the target
    folder is created, the user is told what is happening, and *download* is
    invoked. Any failure is logged and reported back to the chat, except
    errors whose text contains "Timed out", which are logged only.

    :param urls: list of URLs to fetch.
    :param message: Telegram message that triggered the download.
    :param download: callable with the ``download_ydl`` / ``download_raw``
        signature.
    :param filename: optional file name passed through to *download*.
    """
    try:
        hashtag = get_first_hashtag(message)
        if hashtag is None:
            logger.info("Ignoring %s due to no hashtag present...", urls)
            return

        logger.info("Downloading %s", urls)

        reply = 'Downloading'
        if hashtag:
            mkdir_p(OUT_DIR + '/' + hashtag)
            reply += ' to "' + hashtag + '"'
        reply += '...'

        if hashtag == 'AUDIO' and download != download_raw:
            reply += ' (And also guessing you want to extract the audio)'
        message.reply_text(reply)
        download(urls,
                 hashtag or '.', message.date,
                 extract=(hashtag == 'AUDIO'),
                 filename=filename)
    except Exception as exc:
        # Catch Exception rather than everything, so KeyboardInterrupt and
        # SystemExit still propagate; keep a traceback in the log.
        logger.exception("Failed to handle %s", urls)
        if "Timed out" not in str(exc):
            message.reply_text("Something is FUCKED: %s" % exc)


def handle_url(bot, update):
    """Download the URLs in a message that youtube-dl can handle.

    Only ``url`` entities accepted by ``ytdl_has`` are kept; if any remain
    they are passed to ``handle`` with the youtube-dl downloader.
    """
    message = update.message
    candidates = (message.parse_entity(entity)
                  for entity in message.entities
                  if entity.type == 'url')
    ytdl_urls = [url for url in candidates if ytdl_has(url)]
    if ytdl_urls:
        handle(ytdl_urls, message, download_ydl)


# noinspection PyBroadException
def handle_rest(bot, update):
    """Download the media attached to a message, if there is any.

    Attachment kinds are checked in a fixed priority order: photo, document,
    audio, video, video note, voice. Documents and audio also provide a file
    name (``file_name`` / ``title``). The file URL is resolved through the
    bot and handed to ``handle`` with the raw downloader.
    """
    message = update.message
    file_id = None
    filename = None

    if len(message.photo) > 0:
        # Several sizes of the same photo may be listed; take the widest.
        file_id = max(message.photo, key=lambda size: size.width).file_id
    else:
        # (message attribute, attribute holding a file name or None),
        # in priority order.
        attachment_kinds = (
            ('document', 'file_name'),
            ('audio', 'title'),
            ('video', None),
            ('video_note', None),
            ('voice', None),
        )
        for kind, name_attr in attachment_kinds:
            attachment = getattr(message, kind)
            if attachment is None:
                continue
            if name_attr is not None:
                filename = getattr(attachment, name_attr)
            file_id = attachment.file_id
            break

    if file_id is None:
        return
    url = bot.getFile(file_id).file_path
    handle([url], message, download_raw, filename=filename)


def handle_text(bot, update):
    """Feed the text of an incoming message into the Markov corpus."""
    incoming = update.message.text
    add_to_corpus(incoming)


def start(bot, update):
    """Reply to the ``/start`` command with a generated sentence."""
    # make_sentence() can give up and return None, which cannot be sent as a
    # reply; fall back to the same canned text the error handler uses.
    sentence = markov.make_sentence() or "Mmmm, I like it..."
    update.message.reply_text(sentence)


def error(bot, update, error):
    """Dispatcher error callback: log the error and react in the chat.

    For errors whose text contains "Timed out", a generated sentence is sent
    and media handling is retried via ``handle_rest``; any other error is
    reported back verbatim. Nothing is sent when there is no message to
    reply to.

    :param bot: bot instance, passed on to ``handle_rest``.
    :param update: update that caused the error; may be None.
    :param error: the error object or message.
    """
    logger.error(error)
    # Without an update, or with an update that carries no message, there is
    # nothing to reply to; bail out so the error handler cannot itself fail
    # with an AttributeError.
    message = getattr(update, 'message', None)
    if message is None:
        return
    if "Timed out" in str(error):
        message.reply_text(markov.make_sentence(tries=100) or "Mmmm, I like it...")
        handle_rest(bot, update)
    else:
        message.reply_text("Something is fucked: %s" % error)


def main():
    """Load the Markov corpus, register the handlers and run the bot.

    Reads ``initial.txt`` from the current working directory, builds the
    module-level ``markov`` model from it, wires up the Telegram handlers and
    polls until the process is stopped.
    """
    global markov

    with open("initial.txt") as f:
        text = f.read()
        markov = markovify.NewlineText(text.lower())
        # make_sentence() may return None; %s formatting copes with that,
        # whereas string concatenation would raise TypeError at start-up.
        logger.info("Sentence of the day: %s", markov.make_sentence())

    updater = Updater("***REMOVED***")

    dp = updater.dispatcher

    dp.add_handler(CommandHandler("start", start))

    dp.add_error_handler(error)

    # NOTE(review): registration order presumably decides which handler gets
    # a message that matches several filters — confirm against the
    # dispatcher's semantics before reordering.
    dp.add_handler(MessageHandler(Filters.entity(MessageEntity.URL), handle_url))
    dp.add_handler(
        MessageHandler(
            Filters.photo | Filters.video | Filters.video_note | Filters.audio | Filters.voice | Filters.document,
            handle_rest))
    dp.add_handler(MessageHandler(Filters.entity(MessageEntity.HASHTAG), handle_hashtag))
    dp.add_handler(MessageHandler(Filters.text, handle_text))

    updater.start_polling()

    logger.info("Started Telegram bot...")

    updater.idle()


# Start the bot only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
add shebang 2018-01-31 12:32:25 +01:00			`#!/usr/bin/env python3`
add saving to hashtagged folders 2018-02-02 15:28:49 +01:00			`import errno`
Initial commit. 2018-01-30 13:47:33 +01:00			`import logging`
			`import os`
filetype detection 2018-01-31 14:34:59 +01:00			`import re`
yt-download to /tmp first 2018-01-31 14:11:42 +01:00			`import shutil`
attempt at error reporting numero deux 2018-01-31 12:30:08 +01:00			`import sys`
hotfix this ytdl merging bullshit 2018-01-31 14:23:01 +01:00			`from glob import glob`
Initial commit. 2018-01-30 13:47:33 +01:00
filetype detection 2018-01-31 14:34:59 +01:00			`import filetype`
add markov bullshit 2018-10-02 16:12:32 +02:00			`import markovify`
Initial commit. 2018-01-30 13:47:33 +01:00			`import requests`
			`import youtube_dl`
			`from telegram import MessageEntity`
			`from telegram.ext import Updater, CommandHandler, MessageHandler, Filters`

			`DIR = os.path.dirname(os.path.realpath(__file__))`
/tmp -> /var/tmp due to archlinux tmpfs 2018-02-04 21:28:58 +01:00			`TMP_DIR = '/var/tmp'`
Initial commit. 2018-01-30 13:47:33 +01:00			`OUT_DIR = DIR + '/out'`

different kind of logging 2018-01-31 15:58:56 +01:00			`logging.basicConfig(level=logging.INFO,`
log to file pls 2018-01-31 14:56:29 +01:00			`format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')`
			`logger = logging.getLogger("kunsax")`

minor cleanup 2018-10-02 16:15:29 +02:00			`markov = None`
add markov bullshit 2018-10-02 16:12:32 +02:00

			`def add_to_corpus(text):`
			`global markov`
			`text = text.lower()`
			`new_sentence = markovify.NewlineText(text)`
			`markov = markovify.combine([markov, new_sentence])`
			`with open("initial.txt", 'a') as f:`
			`f.write(text + '\n')`

Initial commit. 2018-01-30 13:47:33 +01:00
add saving to hashtagged folders 2018-02-02 15:28:49 +01:00			`def datestr(date):`
			`return date.strftime("%Y-%m-%d@%H%M")`


			`def mkdir_p(path):`
			`try:`
			`os.makedirs(path)`
			`except OSError as exc: # Python >2.5`
			`if exc.errno == errno.EEXIST and os.path.isdir(path):`
			`pass`
			`else:`
			`raise`


Initial commit. 2018-01-30 13:47:33 +01:00			`def ytdl_has(url):`
			`ies = youtube_dl.extractor.gen_extractors()`
			`for ie in ies:`
			`if ie.suitable(url) and ie.IE_NAME != 'generic' \`
			`and '/channel/' not in url:`
			`# Site has dedicated extractor`
			`return True`
			`return False`


grande refactor 2018-04-25 14:21:48 +02:00			`def download_ydl(urls, subdir, date, extract=False, filename=None):`
Initial commit. 2018-01-30 13:47:33 +01:00			`ydl_opts = {`
			`'noplaylist': True,`
hotfix this ytdl merging bullshit 2018-01-31 14:23:01 +01:00			`'restrictfilenames': True,`
remove format strings :( 2018-09-02 20:04:16 +02:00			`'outtmpl': TMP_DIR + '/' + datestr(date) + '__%(title)s__%(id)s.%(ext)s'`
Initial commit. 2018-01-30 13:47:33 +01:00			`}`
audio extraction capabilities 2018-02-02 16:03:25 +01:00			`if extract:`
			`ydl_opts['format'] = 'bestaudio'`
simply dl original audio on #audio (?) 2018-02-04 23:38:07 +01:00			`# ydl_opts['postprocessors'] = [{`
			`# 'key': 'FFmpegExtractAudio',`
			`# 'preferredcodec': 'wav'`
			`# }]`
Initial commit. 2018-01-30 13:47:33 +01:00			`with youtube_dl.YoutubeDL(ydl_opts) as ydl:`
add multiple URLs for ytdl; add manual mp4 to animations 2018-01-30 14:21:50 +01:00			`ydl.download(urls)`
remove format strings :( 2018-09-02 20:04:16 +02:00			`out_dir = OUT_DIR + '/' + subdir + '/'`
yt-download to /tmp first 2018-01-31 14:11:42 +01:00			`for filename in map(ydl.prepare_filename, map(ydl.extract_info, urls)):`
hotfix this ytdl merging bullshit 2018-01-31 14:23:01 +01:00			`globbeds = glob(os.path.splitext(filename)[0] + '.*')`
			`for globbed in globbeds:`
add saving to hashtagged folders 2018-02-02 15:28:49 +01:00			`logger.info("Moving %s to %s..." % (globbed, out_dir))`
			`shutil.move(globbed, out_dir)`
Initial commit. 2018-01-30 13:47:33 +01:00

grande refactor 2018-04-25 14:21:48 +02:00			`def download_raw(urls, subdir, date, extract=False, filename=None):`
			`for url in urls:`
remove format strings :( 2018-09-02 20:04:16 +02:00			`local_filename = OUT_DIR + '/' + subdir + '/' + "%s__%s" % (datestr(date), filename or url.split('/')[-1])`
grande refactor 2018-04-25 14:21:48 +02:00			`r = requests.get(url, stream=True)`
			`with open(local_filename, 'wb') as f:`
			`for chunk in r.iter_content(chunk_size=1024):`
			`if chunk:`
			`f.write(chunk)`
			`if not re.match(r'.*\..{3,5}$', os.path.split(local_filename)[-1]):`
			`kind = filetype.guess(local_filename)`
			`if kind is None:`
			`logger.error("File has no extension and could not be determined!")`
			`else:`
			`logger.info('Moving file without extension... %s?' % kind.extension)`
			`shutil.move(local_filename, local_filename + '.' + kind.extension)`
Initial commit. 2018-01-30 13:47:33 +01:00

hashtags before message too! 2018-04-25 14:30:08 +02:00			`last_hashtag = None`


add saving to hashtagged folders 2018-02-02 15:28:49 +01:00			`def get_first_hashtag(message):`
hashtags before message too! 2018-04-25 14:30:08 +02:00			`global last_hashtag`
fix it yes 2018-02-06 19:42:51 +01:00			`hashtags = list(map(message.parse_entity,`
			`list(filter(lambda e: e.type == 'hashtag', message.entities))))`
			`hashtags += list(map(message.parse_caption_entity,`
			`list(filter(lambda e: e.type == 'hashtag', message.caption_entities))))`
add saving to hashtagged folders 2018-02-02 15:28:49 +01:00			`if len(hashtags) == 0:`
hashtags before message too! 2018-04-25 14:30:08 +02:00			`if last_hashtag is not None and last_hashtag[0] == message.from_user:`
			`prehashtag = last_hashtag[1]`
			`last_hashtag = None`
			`else:`
			`return None`
			`else:`
			`prehashtag = hashtags[0]`
			`hashtag = prehashtag[1:].upper()`
PRAS 2018-04-25 13:55:52 +02:00			`if "PRAS" in hashtag:`
			`hashtag = "PRAS"`
			`return hashtag`
add saving to hashtagged folders 2018-02-02 15:28:49 +01:00

hashtags before message too! 2018-04-25 14:30:08 +02:00			`def handle_hashtag(bot, update):`
			`global last_hashtag`
			`hashtags = list(map(update.message.parse_entity,`
			`list(filter(lambda e: e.type == 'hashtag', update.message.entities))))`
			`if len(hashtags) > 0:`
			`last_hashtag = (update.message.from_user, hashtags[0])`


attempt at error reporting numero deux 2018-01-31 12:30:08 +01:00			`# noinspection PyBroadException`
grande refactor 2018-04-25 14:21:48 +02:00			`def handle(urls, message, download, filename=None):`
fix timeouts? 2018-04-26 18:03:28 +02:00			`try:`
			`hashtag = get_first_hashtag(message)`
Ignore by default, only dl w/ hashtag 2018-09-02 19:55:18 +02:00			`if hashtag is None:`
			`logger.info("Ignoring %s due to no hashtag present..." % urls)`
fix timeouts? 2018-04-26 18:03:28 +02:00			`return`

Ignore by default, only dl w/ hashtag 2018-09-02 19:55:18 +02:00			`logger.info("Downloading %s" % urls)`

fix timeouts? 2018-04-26 18:03:28 +02:00			`reply = 'Downloading'`
			`if hashtag:`
remove format strings :( 2018-09-02 20:04:16 +02:00			`mkdir_p(OUT_DIR + '/' + hashtag)`
			`reply += ' to "' + hashtag + '"'`
fix timeouts? 2018-04-26 18:03:28 +02:00			`reply += '...'`

			`if hashtag == 'AUDIO' and download != download_raw:`
			`reply += ' (And also guessing you want to extract the audio)'`
			`message.reply_text(reply)`
			`download(urls,`
			`hashtag or '.', message.date,`
			`extract=(hashtag == 'AUDIO'),`
			`filename=filename)`
			`except:`
			`_, exc_value, __ = sys.exc_info()`
			`if "Timed out" not in str(exc_value):`
			`message.reply_text("Something is FUCKED: %s" % exc_value)`
grande refactor 2018-04-25 14:21:48 +02:00

			`def handle_url(bot, update):`
			`ytdl_urls = list(filter(ytdl_has,`
			`map(lambda e: update.message.parse_entity(e),`
			`filter(lambda e: e.type == 'url',`
			`update.message.entities))))`
			`if len(ytdl_urls) > 0:`
			`handle(ytdl_urls, update.message, download_ydl)`
attempt at error reporting numero deux 2018-01-31 12:30:08 +01:00

			`# noinspection PyBroadException`
			`def handle_rest(bot, update):`
tweak filename 2018-02-05 09:39:38 +01:00			`file, filename = None, None`
fix photo detection 2018-01-31 13:42:16 +01:00			`if len(update.message.photo) > 0:`
attempt at error reporting numero deux 2018-01-31 12:30:08 +01:00			`photo = max(update.message.photo, key=lambda p: p.width)`
			`file = photo.file_id`
			`elif update.message.document is not None:`
tweak filename 2018-02-05 09:39:38 +01:00			`filename = update.message.document.file_name`
attempt at error reporting numero deux 2018-01-31 12:30:08 +01:00			`file = update.message.document.file_id`
			`elif update.message.audio is not None:`
tweak filename 2018-02-05 09:39:38 +01:00			`filename = update.message.audio.title`
attempt at error reporting numero deux 2018-01-31 12:30:08 +01:00			`file = update.message.audio.file_id`
			`elif update.message.video is not None:`
			`file = update.message.video.file_id`
update deps, add support for video notes 2018-10-02 13:36:23 +02:00			`elif update.message.video_note is not None:`
			`file = update.message.video_note.file_id`
capture also voice messages 2018-02-03 15:44:01 +01:00			`elif update.message.voice is not None:`
			`file = update.message.voice.file_id`
update deps, add support for video notes 2018-10-02 13:36:23 +02:00
attempt at error reporting numero deux 2018-01-31 12:30:08 +01:00			`if file is not None:`
grande refactor 2018-04-25 14:21:48 +02:00			`url = bot.getFile(file).file_path`
			`handle([url], update.message, download_raw, filename=filename)`
Initial commit. 2018-01-30 13:47:33 +01:00

add markov bullshit 2018-10-02 16:12:32 +02:00			`def handle_text(bot, update):`
			`add_to_corpus(update.message.text)`


Initial commit. 2018-01-30 13:47:33 +01:00			`def start(bot, update):`
add markov bullshit 2018-10-02 16:12:32 +02:00			`update.message.reply_text(markov.make_sentence())`
Initial commit. 2018-01-30 13:47:33 +01:00

			`def error(bot, update, error):`
			`logger.error(error)`
maybe fix error? 2018-09-02 20:06:27 +02:00			`if "Timed out" in str(error):`
fix timeouts? 2018-04-26 18:03:28 +02:00			`if update is not None:`
add markov bullshit 2018-10-02 16:12:32 +02:00			`update.message.reply_text(markov.make_sentence(tries=100) or "Mmmm, I like it...")`
fix error in error handler 2018-09-02 19:59:11 +02:00			`handle_rest(bot, update)`
fix timeouts? 2018-04-26 18:03:28 +02:00			`else:`
			`if update is not None:`
			`update.message.reply_text("Something is fucked: %s" % error)`
Initial commit. 2018-01-30 13:47:33 +01:00

			`def main():`
add markov bullshit 2018-10-02 16:12:32 +02:00			`global markov`

			`with open("initial.txt") as f:`
			`text = f.read()`
			`markov = markovify.NewlineText(text.lower())`
more cleanups 2018-10-02 16:16:30 +02:00			`logger.info("Sentence of the day: " + markov.make_sentence())`
add markov bullshit 2018-10-02 16:12:32 +02:00
Initial commit. 2018-01-30 13:47:33 +01:00			`updater = Updater("*REMOVED*")`

			`dp = updater.dispatcher`

			`dp.add_handler(CommandHandler("start", start))`

			`dp.add_error_handler(error)`

			`dp.add_handler(MessageHandler(Filters.entity(MessageEntity.URL), handle_url))`
tweak filename 2018-02-05 09:39:38 +01:00			`dp.add_handler(`
update deps, add support for video notes 2018-10-02 13:36:23 +02:00			`MessageHandler(`
			`Filters.photo \| Filters.video \| Filters.video_note \| Filters.audio \| Filters.voice \| Filters.document,`
			`handle_rest))`
hashtags before message too! 2018-04-25 14:30:08 +02:00			`dp.add_handler(MessageHandler(Filters.entity(MessageEntity.HASHTAG), handle_hashtag))`
add markov bullshit 2018-10-02 16:12:32 +02:00			`dp.add_handler(MessageHandler(Filters.text, handle_text))`
Initial commit. 2018-01-30 13:47:33 +01:00
			`updater.start_polling()`

			`logger.info("Started Telegram bot...")`

			`updater.idle()`


			`if __name__ == '__main__':`
			`main()`