delojza/delojza.py

#!/usr/bin/env python3
import errno
import logging
import os
import re
import shutil
import sys
from configparser import ConfigParser
from datetime import datetime
from glob import glob
from operator import itemgetter

import filetype
import markovify
import mutagen.id3
import pytumblr
import requests
import telegram
import youtube_dl
from telegram import MessageEntity
from telegram.ext import Updater, CommandHandler, MessageHandler, Filters
from youtube_dl.version import __version__ as YTDL_VERSION


def mkdir_p(path):
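    """Create a directory and any missing parents, ignoring the error if it already exists."""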
    try:
        os.makedirs(path)
    except OSError as exc:
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise


def datestr(date):
    return date.strftime("%Y-%m-%d@%H%M")


class DelojzaBot:
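    """Telegram bot that files incoming media and links into per-hashtag directories,
    optionally pushing downloaded images to Tumblr."""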
def __init__(self, tg_api_key, out_dir, tmp_dir='/var/tmp', tumblr_keys=None, markov=None):
self.logger = logging.getLogger("delojza")
self.out_dir = out_dir
self.logger.debug('OUT_DIR: ' + out_dir)
self.tmp_dir = tmp_dir
self.logger.debug('TMP_DIR: ' + tmp_dir)
self.markov = markov
self.updater = Updater(tg_api_key)
dp = self.updater.dispatcher
dp.add_handler(CommandHandler("start", self.tg_start))
dp.add_error_handler(self.tg_error)
self.tg_url_handler = MessageHandler(Filters.entity(MessageEntity.URL), self.tg_handle_url)
dp.add_handler(self.tg_url_handler)
self.tg_rest_handler = MessageHandler(Filters.photo | Filters.video | Filters.video_note |
Filters.audio | Filters.voice | Filters.document, self.tg_handle_rest)
dp.add_handler(self.tg_rest_handler)
dp.add_handler(MessageHandler(Filters.entity(MessageEntity.HASHTAG), self.tg_handle_hashtag))
dp.add_handler(MessageHandler(Filters.text, self.tg_handle_text))
dp.add_handler(CommandHandler("stats", self.tg_stats))
dp.add_handler(CommandHandler("orphans", self.tg_orphan))
dp.add_handler(CommandHandler("orphans_full", self.tg_orphan_full))
dp.add_handler(CommandHandler("version", self.tg_version))
        if tumblr_keys:
            self.client = pytumblr.TumblrRestClient(*tumblr_keys)
        else:
            self.client = None

        self.last_hashtag = None
@staticmethod
def ytdl_can(url):
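        """Return True if youtube-dl has a dedicated (non-generic) extractor for the URL
        and the URL is not a channel page."""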
ies = youtube_dl.extractor.gen_extractors()
for ie in ies:
if ie.suitable(url) and ie.IE_NAME != 'generic' \
and '/channel/' not in url:
# Site has dedicated extractor
return True
return False
def download_ytdl(self, urls, subdir, date, message, extract=False, filename=None):
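        """Download the URLs via youtube-dl into tmp_dir, optionally extract MP3 audio and
        ID3-tag it, then move the results into the tag subdirectory. Returns an empty list,
        since the moved files are not tracked."""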
ydl_opts = {
'noplaylist': True,
'restrictfilenames': True,
'outtmpl': os.path.join(self.tmp_dir, '{}__%(title)s__%(id)s.%(ext)s'.format(datestr(date)))
}
if extract:
ydl_opts['format'] = 'bestaudio/best'
ydl_opts['postprocessors'] = [{
'key': 'FFmpegExtractAudio',
'preferredcodec': 'mp3',
'preferredquality': '256'
}]
with youtube_dl.YoutubeDL(ydl_opts) as ydl:
ydl.download(urls)
out_dir = os.path.join(self.out_dir, subdir)
for info in [ydl.extract_info(url, download=False) for url in urls]:
filename = ydl.prepare_filename(info)
globbeds = glob(os.path.splitext(filename)[0] + '.*')
for globbed in globbeds:
if globbed.endswith("mp3"):
                        # Prefer extractor metadata; fall back to splitting an "Artist - Title" style title.
                        title = info.get('track') or (info['title'].split("-")[1]
                                                      if "-" in info['title'] else info['title'])
                        title = title.strip()
                        artist = info.get('artist') or (info['title'].split("-")[0]
                                                        if "-" in info['title'] else info['title'])
                        artist = artist.strip()
message.reply_text("Tagging as \"{}\" by \"{}\"".format(title, artist))
self.logger.info("Tagging %s w/ $s - $s...".format(globbed, title, artist))
id3 = mutagen.id3.ID3(globbed)
id3.add(mutagen.id3.TIT2(encoding=3, text=title))
                        # Write the artist frames when it was given explicitly or differs from the derived title.
                        if artist and (info.get('artist') or artist != title):
id3.add(mutagen.id3.TOPE(encoding=3, text=artist))
id3.add(mutagen.id3.TPE1(encoding=3, text=artist))
id3.save()
self.logger.info("Moving %s to %s..." % (globbed, out_dir))
shutil.move(globbed, out_dir)
return []
def download_raw(self, urls, subdir, date, _, extract=False, filename=None):
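        """Stream each URL straight into the tag subdirectory, prefixing the date and guessing a
        file extension when the URL does not provide one. Returns the list of local filenames."""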
filenames = []
for url in urls:
local_filename = os.path.join(self.out_dir, subdir,
"%s__%s" % (datestr(date), filename or url.split('/')[-1]))
r = requests.get(url, stream=True)
with open(local_filename, 'wb') as f:
for chunk in r.iter_content(chunk_size=1024):
if chunk:
f.write(chunk)
            if not re.match(r'.*\..{3,5}$', os.path.split(local_filename)[-1]):
                kind = filetype.guess(local_filename)
                if kind is None:
                    self.logger.error("File has no extension and its type could not be determined!")
                else:
                    self.logger.info('Moving file without extension... %s?' % kind.extension)
                    shutil.move(local_filename, local_filename + '.' + kind.extension)
                    local_filename += '.' + kind.extension
            filenames.append(local_filename)
return filenames
@staticmethod
def extract_first_hashtag(message):
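        """Return the first hashtag of a message (from text or caption) uppercased, collapsing
        anything containing PRAS to 'PRAS'; None when the message carries no hashtag."""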
hashtags = list(map(message.parse_entity,
list(filter(lambda e: e.type == 'hashtag', message.entities))))
hashtags += list(map(message.parse_caption_entity,
list(filter(lambda e: e.type == 'hashtag', message.caption_entities))))
if len(hashtags) > 0:
hashtag = hashtags[0][1:].upper()
if "PRAS" in hashtag:
hashtag = "PRAS"
return hashtag
def get_hashtag(self, message):
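        """Hashtag for a message: its own first hashtag or, failing that, the last bare hashtag
        sent by the same user (consumed once used)."""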
hashtag = self.extract_first_hashtag(message)
if hashtag is None:
if self.last_hashtag is not None and self.last_hashtag[0] == message.from_user:
hashtag = self.last_hashtag[1]
self.last_hashtag = None
return hashtag
def tg_handle_hashtag(self, bot, update):
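        """A bare hashtag either tags a replied-to message for immediate download or is
        remembered as the sender's hashtag for their next message."""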
hashtag = self.extract_first_hashtag(update.message)
if update.message.reply_to_message:
self.handle_tg_message(update.message.reply_to_message, bot, hashtag)
self.handle_urls(update.message.reply_to_message, hashtag)
else:
self.last_hashtag = (update.message.from_user, hashtag)
# noinspection PyBroadException
def handle(self, urls, message, hashtag, download_fn, filename=None):
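        """Common download path: skip messages without a hashtag, create the tag directory,
        extract audio for AUDIO/RADIO tags, run download_fn, and push the results to Tumblr
        for the TUMBLR/TUMBLR_NOW tags."""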
try:
if hashtag is None:
self.logger.info("Ignoring %s due to no hashtag present..." % urls)
return
self.logger.info("Downloading %s under '%s'" % (urls, hashtag))
reply = 'Downloading'
if hashtag:
mkdir_p(os.path.join(self.out_dir, hashtag))
reply += ' to "' + hashtag + '"'
reply += '...'
extract = False
if any([tag in hashtag for tag in ('AUDIO', 'RADIO')]) and download_fn != self.download_raw:
extract = True
reply += ' (And also guessing you want to extract the audio)'
message.reply_text(reply)
filenames = download_fn(urls, hashtag or '.', message.date, message, extract=extract, filename=filename)
if hashtag == 'TUMBLR' and self.client:
message.reply_text('(btw, queueing to tumblr)')
for filename in filenames:
self.client.create_photo('kunsaxan', state="queue", data=filename)
elif hashtag == 'TUMBLR_NOW' and self.client:
message.reply_text('(btw, ***FIRING TO TUMBLR RIGHT AWAY***)',
parse_mode=telegram.ParseMode.MARKDOWN)
for filename in filenames:
self.client.create_photo('kunsaxan', state="published", data=filename)
return filenames
except:
_, exc_value, __ = sys.exc_info()
if "Timed out" not in str(exc_value):
message.reply_text("Something is FUCKED: %s" % exc_value)
def tg_handle_url(self, _, update):
self.handle_urls(update.message, self.get_hashtag(update.message))
def handle_urls(self, message, hashtag):
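        """Split a message's URLs into ones youtube-dl can handle and plain links; plain links
        are only downloaded when they point at images."""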
urls = list(map(lambda e: message.parse_entity(e),
filter(lambda e: e.type == 'url', message.entities)))
ytdl_urls = [url for url in urls if self.ytdl_can(url)]
normal_urls = [url for url in urls if not self.ytdl_can(url)]
if len(ytdl_urls) > 0:
self.handle(ytdl_urls, message, hashtag, self.download_ytdl)
if len(normal_urls) > 0:
image_urls = [url for url in normal_urls if "image" in requests.head(url).headers.get("Content-Type", "")]
if len(image_urls) > 0:
self.handle(image_urls, message, hashtag, self.download_raw)
# noinspection PyBroadException
def tg_handle_rest(self, bot, update):
self.handle_tg_message(update.message, bot, self.get_hashtag(update.message))
def handle_tg_message(self, message, bot, hashtag):
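        """Download the attached photo (largest size), document, audio, video, video note or
        voice message via its Telegram file URL."""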
        file, filename = None, None
if len(message.photo) > 0:
photo = max(message.photo, key=lambda p: p.width)
file = photo.file_id
elif message.document is not None:
filename = message.document.file_name
file = message.document.file_id
elif message.audio is not None:
filename = message.audio.title
file = message.audio.file_id
elif message.video is not None:
file = message.video.file_id
elif message.video_note is not None:
file = message.video_note.file_id
elif message.voice is not None:
file = message.voice.file_id
if file is not None:
url = bot.getFile(file).file_path
self.handle([url], message, hashtag, self.download_raw, filename=filename)
def tg_handle_text(self, _, update):
if self.markov:
self.markov.add_to_corpus(update.message.text)
def tag_dirs(self):
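        """All-uppercase directories directly under out_dir, i.e. the hashtag directories."""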
return list(filter(lambda x: x.upper() == x,
filter(lambda dir: os.path.isdir(os.path.join(self.out_dir, dir)),
os.listdir(self.out_dir))))
def tg_stats(self, _, update):
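        """Reply with per-tag file counts broken down by media type; tags holding at most one
        file are only listed as orphans."""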
tag_dirs = self.tag_dirs()
reply = "Total number of tags: {}\n\n".format(len(tag_dirs))
counts = [(dir, os.listdir(os.path.join(self.out_dir, dir))) for dir in tag_dirs]
counts.sort(key=itemgetter(0))
counts.sort(key=lambda x: len(x[1]), reverse=True)
for dir, files in counts:
if len(files) == 1:
break
abs_paths = [os.path.join(self.out_dir, dir, file) for file in files]
abs_files = list(filter(os.path.isfile, abs_paths))
audio_cnt = len([match for match in map(filetype.audio, abs_files) if match is not None])
video_cnt = len([match for match in map(filetype.video, abs_files) if match is not None])
image_cnt = len([match for match in map(filetype.image, abs_files) if match is not None])
rest_cnt = len(files) - audio_cnt - video_cnt - image_cnt
dir_cnt = len(abs_paths) - len(abs_files)
details = ", ".join(["{} {}".format(cnt, desc) for cnt, desc in
[(image_cnt, "images"), (video_cnt, "videos"), (audio_cnt, "audios"),
(rest_cnt, "unknown files"), (dir_cnt, "directories")] if cnt > 0])
if any([len(abs_paths) == cnt for cnt in [audio_cnt, video_cnt, image_cnt, rest_cnt, dir_cnt]]):
reply += "<b>{}:</b> {}\n".format(dir, details)
else:
reply += "<b>{}:</b> {} files ({})\n".format(dir, len(files), details)
orphans = list(filter(lambda cnt: len(cnt[1]) <= 1, counts))
if len(orphans) > 0:
reply += "\nFollowing tags are orphans: " + ", ".join(map(itemgetter(0), orphans))
update.message.reply_text(reply, parse_mode=telegram.ParseMode.HTML)
def orphans(self):
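        """Return (tag, filename) pairs for tags holding at most one file, sorted by tag."""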
result = []
tag_dirs = self.tag_dirs()
for dir in tag_dirs:
files = os.listdir(os.path.join(self.out_dir, dir))
if len(files) == 1:
result.append((dir, files[0]))
if len(files) == 0:
result.append((dir, "NO FILE AT ALL..."))
return sorted(result, key=itemgetter(0))
def tg_orphan(self, _, update):
orphans = self.orphans()
if len(orphans) == 0:
update.message.reply_text("Good job, no orphan tags!")
else:
update.message.reply_text("The following tags only contain a single file:\n" +
", ".join(map(itemgetter(0), orphans)))
def tg_orphan_full(self, _, update):
orphans = self.orphans()
if len(orphans) == 0:
update.message.reply_text("Good job, no orphan tags!")
else:
tmp_reply = "The following tags only contain a single file:\n"
for dir, file in orphans:
line = "{}: {}\n".format(dir, file)
if len(tmp_reply + line) > 4096:
update.message.reply_text(tmp_reply)
tmp_reply = ""
tmp_reply += line
if len(tmp_reply) > 0:
update.message.reply_text(tmp_reply)
def tg_version(self, _, update):
delojza_date = datetime.fromtimestamp(os.path.getmtime(os.path.realpath(__file__))) \
.strftime('%Y/%m/%d - %H:%M:%S')
update.message.reply_text("delojza modified date: {}\nyoutube-dl version: {}"
.format(delojza_date, YTDL_VERSION))
def tg_start(self, _, update):
update.message.reply_text(self.markov.make_sentence() if self.markov else "HELLO")
def tg_error(self, bot, update, error):
self.logger.error(error)
if "Timed out" in str(error):
if update is not None:
default = "Mmmm, I like it..."
update.message.reply_text((self.markov.make_sentence(tries=100) if self.markov else default) or default)
self.tg_handle_rest(bot, update)
else:
if update is not None:
update.message.reply_text("Something is fucked: %s" % error)
def run_idle(self):
self.updater.start_polling()
self.logger.info("Started Telegram bot...")
self.updater.idle()
class MarkovBlabberer:
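    """Markov-chain text generator backed by a newline-delimited corpus file that grows over time."""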
def __init__(self, filepath):
self.logger = logging.getLogger('markov')
self.filepath = filepath
with open(filepath) as f:
text = f.read()
self.markov = markovify.NewlineText(text.lower())
self.logger.info("Sentence of the day: " + self.make_sentence())
def make_sentence(self, tries=100):
return self.markov.make_sentence(tries=tries)
def add_to_corpus(self, text):
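        """Fold new text into the Markov model and append it to the corpus file."""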
text = text.lower()
new_sentence = markovify.NewlineText(text)
self.markov = markovify.combine([self.markov, new_sentence])
with open(self.filepath, 'a') as f:
f.write(text + '\n')
if __name__ == '__main__':
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
_DIR_ = os.path.dirname(os.path.realpath(__file__))
CONFIG_PATHS = ['/etc/delojza/delojza.ini',
os.path.join(os.getenv("HOME") or "", ".config/delojza/delojza.ini"),
os.path.join(_DIR_, "delojza.ini")]
config = ConfigParser()
try:
CONF_FILE = next(conf_path for conf_path in CONFIG_PATHS if os.path.isfile(conf_path))
config.read(CONF_FILE)
except StopIteration:
logging.error("No config file found, stopping.")
sys.exit(-1)
markov = MarkovBlabberer("initial.txt")
delojza = DelojzaBot(config.get('delojza', 'tg_api_key'),
config.get('delojza', 'OUT_DIR', fallback=os.path.join(_DIR_, "out")),
tmp_dir=config.get('delojza', 'tmp_dir', fallback="/var/tmp"),
tumblr_keys=(config.get('tumblr', 'consumer_key'),
config.get('tumblr', 'consumer_secret'),
config.get('tumblr', 'oauth_key'),
config.get('tumblr', 'oauth_secret')),
markov=markov)
delojza.run_idle()