delojza/delojza.py

281 lines
11 KiB
Python
Executable File

#!/usr/bin/env python3
import errno
import logging
import os
import re
import shutil
import sys
from configparser import ConfigParser
from glob import glob
import filetype
import markovify
import pytumblr
import requests
import youtube_dl
from telegram import MessageEntity
from telegram.ext import Updater, CommandHandler, MessageHandler, Filters
def mkdir_p(path):
    """Create directory *path* and any missing parents, like ``mkdir -p``.

    An already existing directory is not an error.

    Raises:
        OSError: if *path* exists but is not a directory, or if the
            directory cannot be created for any other reason.
    """
    # exist_ok=True only tolerates an existing *directory*; an existing
    # regular file at `path` still raises FileExistsError (an OSError).
    os.makedirs(path, exist_ok=True)
def datestr(date):
    """Render *date* as a ``YYYY-MM-DD@HHMM`` stamp used in file and directory names."""
    stamp_format = "%Y-%m-%d@%H%M"
    return date.strftime(stamp_format)
class DelojzaBot:
    """Telegram bot that saves received media into per-hashtag directories.

    URLs and attached files are downloaded into ``out_dir/<HASHTAG>``, where
    the hashtag is taken from the message itself or from the hashtag the same
    user sent just before.  Downloads tagged ``TUMBLR`` are additionally
    queued to tumblr when tumblr keys were supplied.  Plain text messages are
    fed to an optional markov model that is also used for small-talk replies.
    """

    # Reply used whenever the markov model is missing or yields no sentence.
    _FALLBACK_REPLY = "Mmmm, I like it..."

    def __init__(self, tg_api_key, out_dir, tmp_dir='/var/tmp', tumblr_keys=None, markov=None):
        """Set up the Telegram updater and register all handlers.

        Args:
            tg_api_key: Telegram bot API token.
            out_dir: base directory under which hashtag directories are created.
            tmp_dir: scratch directory youtube-dl downloads into.
            tumblr_keys: optional sequence of arguments for
                ``pytumblr.TumblrRestClient``; tumblr queueing is disabled
                when it is empty or None.
            markov: optional object offering ``make_sentence`` and
                ``add_to_corpus``; small talk is disabled when None.
        """
        self.logger = logging.getLogger("kunsax")

        self.out_dir = out_dir
        self.logger.debug('OUT_DIR: ' + out_dir)
        self.tmp_dir = tmp_dir
        self.logger.debug('TMP_DIR: ' + tmp_dir)
        self.markov = markov

        self.updater = Updater(tg_api_key)
        dp = self.updater.dispatcher

        # NOTE(review): registration order presumably decides which handler
        # gets a message matching several filters - kept exactly as before.
        dp.add_handler(CommandHandler("start", self.tg_start))
        dp.add_error_handler(self.tg_error)
        dp.add_handler(MessageHandler(Filters.entity(MessageEntity.URL), self.handle_url))
        dp.add_handler(
            MessageHandler(
                Filters.photo | Filters.video | Filters.video_note | Filters.audio | Filters.voice | Filters.document,
                self.handle_rest))
        dp.add_handler(MessageHandler(Filters.entity(MessageEntity.HASHTAG), self.handle_hashtag))
        dp.add_handler(MessageHandler(Filters.text, self.handle_text))

        # Always define the attribute so handle() can test for a missing
        # client instead of failing with AttributeError.
        self.client = pytumblr.TumblrRestClient(*tumblr_keys) if tumblr_keys else None

        # (user, '#hashtag') of the last hashtag-only message, or None.
        self.last_hashtag = None

    @staticmethod
    def ytdl_can(url):
        """Return True if youtube-dl has a dedicated (non-generic) extractor for *url*.

        URLs containing ``/channel/`` are never accepted.
        """
        if '/channel/' in url:
            return False
        return any(ie.suitable(url) and ie.IE_NAME != 'generic'
                   for ie in youtube_dl.extractor.gen_extractors())

    def download_ytdl(self, urls, subdir, date, extract=False, filename=None):
        """Download *urls* with youtube-dl and move the results to ``out_dir/subdir``.

        Args:
            urls: iterable of URLs to download.
            subdir: directory name below ``self.out_dir`` to move files into.
            date: datetime used to name the scratch directory.
            extract: when true, only the best audio stream is fetched.
            filename: unused; accepted so that both download functions share
                one signature.

        Returns:
            Always an empty list; paths of the moved files are not reported.
        """
        ydl_opts = {
            'noplaylist': True,
            'restrictfilenames': True,
            'outtmpl': os.path.join(self.tmp_dir, datestr(date), '__%(title)s__%(id)s.%(ext)s')
        }
        if extract:
            # The audio stream is kept in whatever container the site offers;
            # no conversion postprocessor is configured.
            ydl_opts['format'] = 'bestaudio'
        out_dir = os.path.join(self.out_dir, subdir)
        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            for url in urls:
                # extract_info() downloads by default, so one call both
                # fetches the media and yields the metadata needed to work
                # out where the file was written.
                info = ydl.extract_info(url)
                downloaded = ydl.prepare_filename(info)
                # The final extension may differ from the template's guess,
                # hence the glob on the extension-less name.
                for globbed in glob(os.path.splitext(downloaded)[0] + '.*'):
                    self.logger.info("Moving %s to %s..." % (globbed, out_dir))
                    shutil.move(globbed, out_dir)
        return []

    def download_raw(self, urls, subdir, date, extract=False, filename=None):
        """Download *urls* verbatim via HTTP into ``out_dir/subdir``.

        Files are named ``<datestr>__<name>`` where ``<name>`` is *filename*
        or, failing that, the last path component of the URL.  A file whose
        name has no 3-5 character extension gets one appended based on its
        content, when the type can be guessed.

        Args:
            urls: iterable of URLs to download.
            subdir: directory name below ``self.out_dir`` to save into.
            date: datetime used as file name prefix.
            extract: unused; accepted so that both download functions share
                one signature.
            filename: optional name overriding the one derived from the URL.

        Returns:
            List of paths of the saved files, as they exist on disk.
        """
        filenames = []
        for url in urls:
            local_filename = os.path.join(self.out_dir, subdir,
                                          "%s__%s" % (datestr(date), filename or url.split('/')[-1]))
            r = requests.get(url, stream=True)
            with open(local_filename, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)
            if not re.match(r'.*\..{3,5}$', os.path.split(local_filename)[-1]):
                kind = filetype.guess(local_filename)
                if kind is None:
                    self.logger.error("File has no extension and could not be determined!")
                else:
                    self.logger.info('Moving file without extension... %s?' % kind.extension)
                    renamed = local_filename + '.' + kind.extension
                    shutil.move(local_filename, renamed)
                    # Report the path the file actually lives at now.
                    local_filename = renamed
            filenames.append(local_filename)
        return filenames

    def get_first_hashtag(self, message):
        """Return the normalised hashtag that applies to *message*, or None.

        Hashtags in the text are preferred over those in the caption.  When
        the message has none, the hashtag remembered from the same user's
        previous hashtag message is consumed instead.  The result is
        upper-cased without the leading ``#``; anything containing ``PRAS``
        collapses to ``PRAS``.
        """
        hashtags = [message.parse_entity(entity)
                    for entity in (message.entities or []) if entity.type == 'hashtag']
        hashtags += [message.parse_caption_entity(entity)
                     for entity in (message.caption_entities or []) if entity.type == 'hashtag']

        if hashtags:
            prehashtag = hashtags[0]
        elif self.last_hashtag is not None and self.last_hashtag[0] == message.from_user:
            prehashtag = self.last_hashtag[1]
            # A remembered hashtag is good for one use only.
            self.last_hashtag = None
        else:
            return None

        hashtag = prehashtag[1:].upper()
        if "PRAS" in hashtag:
            hashtag = "PRAS"
        return hashtag

    def handle_hashtag(self, _, update):
        """Remember the first hashtag of a hashtag message together with its sender."""
        message = update.message
        hashtags = [message.parse_entity(entity)
                    for entity in (message.entities or []) if entity.type == 'hashtag']
        if hashtags:
            self.last_hashtag = (message.from_user, hashtags[0])

    def handle(self, urls, message, download_fn, filename=None):
        """Download *urls* into the directory of the message's hashtag.

        Args:
            urls: list of URLs to fetch.
            message: Telegram message that triggered the download; used for
                the hashtag, the date and for replying.
            download_fn: ``self.download_ytdl`` or ``self.download_raw``.
            filename: optional file name passed through to *download_fn*.

        Returns:
            The list returned by *download_fn*, or None when the message has
            no applicable hashtag or the download failed.
        """
        try:
            hashtag = self.get_first_hashtag(message)
            if hashtag is None:
                self.logger.info("Ignoring %s due to no hashtag present..." % urls)
                return None

            self.logger.info("Downloading %s under '%s'" % (urls, hashtag))

            reply = 'Downloading'
            if hashtag:
                mkdir_p(os.path.join(self.out_dir, hashtag))
                reply += ' to "' + hashtag + '"'
            reply += '...'

            extract = False
            if hashtag in ('AUDIO', 'RADIO') and download_fn != self.download_raw:
                extract = True
                reply += ' (And also guessing you want to extract the audio)'
            message.reply_text(reply)

            filenames = download_fn(urls, hashtag or '.', message.date,
                                    extract=extract, filename=filename)
            if hashtag == 'TUMBLR':
                if self.client is None:
                    self.logger.warning("TUMBLR hashtag used, but no tumblr client is configured.")
                else:
                    message.reply_text('(btw, queueing to tumblr)')
                    for downloaded in filenames:
                        self.client.create_photo('kunsaxan', state="queue", data=downloaded)
            return filenames
        except Exception as exc:
            # Deliberately broad: any failure is reported back to the chat.
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            self.logger.exception("Failed to handle %s", urls)
            if "Timed out" not in str(exc):
                message.reply_text("Something is FUCKED: %s" % exc)
            return None

    def handle_url(self, _, update):
        """Download every URL in the message, via youtube-dl where supported.

        URLs without a dedicated youtube-dl extractor are only downloaded if
        a HEAD request reports an image content type.
        """
        message = update.message
        urls = [message.parse_entity(entity)
                for entity in message.entities if entity.type == 'url']

        # ytdl_can() instantiates every extractor, so ask once per URL.
        ytdl_urls, normal_urls = [], []
        for url in urls:
            (ytdl_urls if self.ytdl_can(url) else normal_urls).append(url)

        if ytdl_urls:
            self.handle(ytdl_urls, message, self.download_ytdl)
        if normal_urls:
            image_urls = [url for url in normal_urls
                          if "image" in requests.head(url).headers.get("Content-Type", "")]
            if image_urls:
                self.handle(image_urls, message, self.download_raw)

    def handle_rest(self, bot, update):
        """Download the file attached to the message (photo, document, audio, ...)."""
        message = update.message
        file_id, filename = None, None

        if message.photo:
            # Several sizes of one photo are offered; take the widest.
            file_id = max(message.photo, key=lambda p: p.width).file_id
        elif message.document is not None:
            filename = message.document.file_name
            file_id = message.document.file_id
        elif message.audio is not None:
            filename = message.audio.title
            file_id = message.audio.file_id
        elif message.video is not None:
            file_id = message.video.file_id
        elif message.video_note is not None:
            file_id = message.video_note.file_id
        elif message.voice is not None:
            file_id = message.voice.file_id

        if file_id is not None:
            url = bot.getFile(file_id).file_path
            self.handle([url], message, self.download_raw, filename=filename)

    def handle_text(self, _, update):
        """Feed a plain text message to the markov model, if there is one."""
        if self.markov is not None:
            self.markov.add_to_corpus(update.message.text)

    def _small_talk(self, **kwargs):
        """Return a markov sentence, or the fallback reply if none is available."""
        sentence = self.markov.make_sentence(**kwargs) if self.markov is not None else None
        return sentence or self._FALLBACK_REPLY

    def tg_start(self, _, update):
        """Answer the ``/start`` command with a generated sentence."""
        update.message.reply_text(self._small_talk())

    def tg_error(self, bot, update, error):
        """Log a dispatcher error and tell the chat about it.

        On a timeout the bot sends some small talk and retries the
        attachment download; any other error is echoed to the chat.
        """
        self.logger.error(error)
        # Some errors carry no update (or no message) to reply to.
        if update is None or update.message is None:
            return
        if "Timed out" in str(error):
            update.message.reply_text(self._small_talk(tries=100))
            self.handle_rest(bot, update)
        else:
            update.message.reply_text("Something is fucked: %s" % error)

    def run_idle(self):
        """Start polling Telegram and block until the updater is stopped."""
        self.updater.start_polling()
        self.logger.info("Started Telegram bot...")
        self.updater.idle()
class MarkovBlabberer:
    """Markov-chain sentence generator backed by a text corpus file.

    The corpus is read once at start-up; every sentence added later is merged
    into the in-memory model and appended to the same file.
    """

    def __init__(self, filepath):
        """Build the model from the newline-separated corpus at *filepath*.

        Raises:
            OSError: if the corpus file cannot be read.
        """
        self.logger = logging.getLogger('markov')
        self.filepath = filepath

        # Explicit encoding keeps reading and appending independent of the
        # process locale.
        with open(filepath, encoding='utf-8') as f:
            text = f.read()
        self.markov = markovify.NewlineText(text.lower())
        # make_sentence() may return None; %s formatting copes with that,
        # string concatenation would raise TypeError.
        self.logger.info("Sentence of the day: %s", self.make_sentence())

    def make_sentence(self, tries=100):
        """Return a generated sentence, or None if none was found in *tries* attempts."""
        return self.markov.make_sentence(tries=tries)

    def add_to_corpus(self, text):
        """Merge the lower-cased *text* into the model and append it to the corpus file.

        Blank text is ignored.
        """
        text = text.lower()
        if not text.strip():
            return
        new_sentence = markovify.NewlineText(text)
        self.markov = markovify.combine([self.markov, new_sentence])
        with open(self.filepath, 'a', encoding='utf-8') as f:
            f.write(text + '\n')
if __name__ == '__main__':
    # Script entry point: locate the configuration, build the bot and run it.
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    _DIR_ = os.path.dirname(os.path.realpath(__file__))
    # Candidate config files; the first existing one wins.  expanduser()
    # also works when $HOME is unset, where os.getenv("HOME") would make
    # os.path.join() raise TypeError.
    CONFIG_PATHS = ['/etc/delojza/delojza.ini',
                    os.path.join(os.path.expanduser("~"), ".config/delojza/delojza.ini"),
                    os.path.join(_DIR_, "delojza.ini")]

    config = ConfigParser()
    CONF_FILE = next((conf_path for conf_path in CONFIG_PATHS if os.path.isfile(conf_path)), None)
    if CONF_FILE is None:
        logging.error("No config file found, stopping.")
        sys.exit(-1)
    config.read(CONF_FILE)

    # Tumblr is optional: without a [tumblr] section the bot runs with
    # tumblr queueing disabled.  A section with missing keys is still an error.
    tumblr_keys = None
    if config.has_section('tumblr'):
        tumblr_keys = (config.get('tumblr', 'consumer_key'),
                       config.get('tumblr', 'consumer_secret'),
                       config.get('tumblr', 'oauth_key'),
                       config.get('tumblr', 'oauth_secret'))

    markov = MarkovBlabberer("initial.txt")

    # NOTE(review): the model built above used to be discarded (markov=None),
    # leaving the text/start handlers without a model - confirm that handing
    # it to the bot is the intended behaviour.
    delojza = DelojzaBot(config.get('delojza', 'tg_api_key'),
                         config.get('delojza', 'OUT_DIR', fallback=os.path.join(_DIR_, "out")),
                         tmp_dir=config.get('delojza', 'tmp_dir', fallback="/var/tmp"),
                         tumblr_keys=tumblr_keys,
                         markov=markov)
    delojza.run_idle()