diff bin/moefetch @ 311:dd2ddddf00d5

Merge.
author Edho Arief <edho@myconan.net>
date Wed, 07 Mar 2012 14:17:51 +0700
parents 21b86001b0c5
children 110d50856dde
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/moefetch	Wed Mar 07 14:17:51 2012 +0700
@@ -0,0 +1,428 @@
+#!/bin/sh
+
+# Copyright (c) 2009-2012, edogawaconan <edho@myconan.net>
+# 
+# Permission to use, copy, modify, and/or distribute this software for any
+# purpose with or without fee is hereby granted, provided that the above
+# copyright notice and this permission notice appear in all copies.
+# 
+# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+#
+# Lots of bugs here. Use with care
+# USE WITH CARE
+#
+# what it does: fetch every picture that has the specified TAGS.
+# requirement: wget, libxslt, openssl
+
+# Extra directories to prepend to PATH when looking up the external programs
+# this script runs (cut, sed, openssl, wget, xsltproc, grep, ...).
+# Leave empty to use PATH unchanged.
+ADDITIONAL_PATH=
+
+# Default server address, used when -s is not given.
+# Only Danbooru-powered sites are supported.
+DEFAULT_SITE="moe.imouto.org"
+
+# Base directory. Must be an absolute path that you own and can write to.
+# Downloads are stored under ${BASE_DIR}/<SITE>/<TAGS>; the temp, trash and
+# deleted folders are created directly below ${BASE_DIR}.
+# Leave empty to use the directory the script is run from.
+BASE_DIR=
+
+# Nothing below this line is meant to be edited by the user.
+
+# Abort as soon as an unhandled command fails.
+set -e
+# Treat the expansion of an unset variable as an error.
+set -u
+# User-Agent header sent with every wget request.
+useragent="Mozilla/5.0 (Windows NT 6.1; WOW64; rv:10.0) Gecko/20100101 Firefox/10.0"
+
+# Print the start-up banner: program name with ${_version}, the copyright
+# line, and one empty line after it.
+msg_welcome() {
+	printf '%s\n%s\n\n' \
+		"moefetch ${_version}" \
+		"Copyright (c) 2009-2012 edogawaconan <edho@myconan.net>"
+}
+
+# Make a path safe to hand to a command as an operand.
+# Usage: cmd "$(safe_path "${filename}")"
+#
+# Arguments: the path (all arguments are joined with spaces, as "$*").
+# Output:    the path unchanged when it already starts with "." or "/";
+#            otherwise the path prefixed with "./", so that a leading "-"
+#            cannot be mistaken for an option. No trailing newline is added.
+safe_path()
+{
+	# Match on the string itself instead of piping it through cut: cut works
+	# line by line, so a path containing a newline would be misclassified.
+	# This also avoids leaving helper variables behind in the caller's scope.
+	case "$*" in
+		.*|/*) printf "%s" "$*";; # Starts with . or /: safe, no change.
+		*) printf "%s" "./$*";; # Anything else (including empty) gets ./
+	esac
+}
+
+# Print the MD5 digest of the file named by $1.
+# The file content is fed to openssl on stdin; sed then keeps only the
+# 32-hex-digit digest from the last line of openssl's output.
+get_md5() {
+	cat "$(safe_path "${1}")" \
+		| openssl dgst -md5 \
+		| tail -n 1 \
+		| sed -e 's/.*\([[:xdigit:]]\{32\}\).*/\1/'
+}
+
+# Print the last component of the path in $1.
+# The path is passed through safe_path first so that a name starting with
+# "-" is not taken as an option by basename.
+get_basename() {
+	set -- "$(safe_path "${1}")"
+	basename "${1}"
+}
+
+# Print the file name of $1 without its directory and without its last
+# extension (e.g. /a/b/0123.jpg -> 0123).
+get_filename() {
+	# Take the basename first: stripping "%.*" from the whole path would cut
+	# at a dot inside a directory name whenever the file itself has no
+	# extension (e.g. .../moe.imouto.org/tag/README).
+	set -- "$(get_basename "${1}")"
+	case "${1}" in
+		?*.*) printf '%s\n' "${1%.*}";; # non-empty stem followed by an extension
+		*) printf '%s\n' "${1}";; # no extension, or a dotfile such as .profile
+	esac
+}
+
+# Prepare the tag string for use in the query URL: every "&" becomes %26 and
+# every "=" becomes %3D. All arguments are joined with spaces; the output
+# ends with one extra space and has no trailing newline.
+get_cleantags() {
+	printf "%s " "$*" \
+		| sed -e 's/\&/%26/g' -e 's/=/%3D/g'
+}
+
+# Print nothing (an empty line) when the file name of $1, without directory
+# and extension, is exactly 32 lowercase hex digits; print the name otherwise.
+# Callers test the output with [ -n ... ].
+is_not_md5() {
+	# Anchored on both ends: an unanchored global substitution would also
+	# erase a 64-digit name and report it as a valid md5.
+	get_filename "$1" | sed -e 's/^[0-9a-f]\{32\}$//'
+}
+
+
+# Fatal error handler: print an empty line, then "Fatal error: " followed by
+# the message in $1, and terminate the script with exit status 1.
+Err_Fatal() {
+	echo ""
+	echo "Fatal error: ${1}"
+	exit 1
+}
+
+# Handler for failures that should not be able to happen (for instance a
+# mkdir or mv inside the working directories failing). Prints a request to
+# report the problem and terminates the script with exit status 1.
+Err_Impossible() {
+	echo ""
+	echo "Impossible error. Or you modified content of the working directories when the script is running."
+	echo "Please report to moefetch.googlecode.com if you see this message (complete with entire run log)"
+	exit 1
+}
+
+# Print the usage text (commands, options, tags; the default site comes from
+# ${DEFAULT_SITE}) and terminate the script with exit status 2.
+Err_Help() {
+	# One argument per output line; printf '%s\n' repeats the format for each.
+	printf '%s\n' \
+		"moefetch.sh COMMAND [-n] [-p PASSWORD] [-s SITE_URL] [-u USERNAME] TAGS" \
+		"" \
+		"COMMAND: " \
+		"	(quick)fetch:" \
+		"		Do a complete update. Add prefix quick to skip file checking" \
+		"	check:" \
+		"		Get list of new files, clean up local folder and print total new files" \
+		"" \
+		"OPTIONS:" \
+		"	-n:" \
+		"		Skip checking repository directory." \
+		"	-p PASSWORD:" \
+		"		Specifies password for login." \
+		"	-s SITE_URL: " \
+		"		Specify URL of the Danbooru powered site you want to leech from. Default is ${DEFAULT_SITE}." \
+		"	-u USERNAME:" \
+		"		Specifies username for login." \
+		"	TAGS: " \
+		"		Tags you want to download. Separated by spaces. Tag name follows standard Danbooru tagging scheme."
+	exit 2
+}
+
+# Build the list of file URLs for ${TAGS} on ${SITE}.
+#
+# Downloads http://${SITE}/post/index.xml page by page (1000 posts per page),
+# extracts every post's file_url with an XSLT stylesheet, and appends the
+# URLs to ${TEMP_PREFIX}-list. Paging stops at the first page that yields
+# fewer than 1000 URLs.
+#
+# Reads:  SITE, TAGS, TEMP_PREFIX, useragent, _use_login and, when
+#         _use_login is 1, LOGIN_USER and LOGIN_PASS.
+# Writes: ${TEMP_PREFIX}-xml, -templist and -list; sets url, tempnum,
+#         iternum and numfiles.
+# Exits through Err_Fatal when a page cannot be downloaded or when no URL
+# at all was found.
+Generate_Link() {
+	echo "
+Fetching XML file"
+	tempnum=1000
+	iternum=1
+	> "${TEMP_PREFIX}-list"
+	while [ "${tempnum}" -ge 1000 ]; do
+		url="http://${SITE}/post/index.xml?tags=$(get_cleantags "${TAGS}")&offset=0&limit=1000&page=${iternum}"
+		if [ "${_use_login}" -eq 1 ]; then
+			url="${url}&login=${LOGIN_USER}&password_hash=${LOGIN_PASS}"
+		fi
+		wget --quiet "${url}" -O "${TEMP_PREFIX}-xml" --referer="http://${SITE}/post" --user-agent="${useragent}" -e continue=off || Err_Fatal "Failed download catalog file"
+		printf "Processing XML file... "
+		# The sed expression keeps the URL up to and including the 32-hex-digit
+		# name and re-attaches the extension, dropping anything in between.
+		# grep exits 1 when a page has no URL; that is a normal result, so it is
+		# neutralised here instead of letting set -e kill the script silently.
+		xsltproc - "${TEMP_PREFIX}-xml" <<EOF | sed 's/.*\(http.*\)\(\/[a-f0-9]\{32\}\).*\.\([^\.]*\)/\1\2.\3/g' | { grep ^http || true; } > "${TEMP_PREFIX}-templist"
+<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
+<xsl:output method="xml" indent="yes"/>
+<xsl:template match="post">
+<xsl:value-of select="@file_url" />
+</xsl:template>
+</xsl:stylesheet>
+EOF
+		# grep -c still prints 0 when nothing matches, but exits 1.
+		tempnum=$(grep -c . "${TEMP_PREFIX}-templist" || true)
+		iternum=$((iternum + 1))
+		cat "${TEMP_PREFIX}-templist" >> "${TEMP_PREFIX}-list"
+		echo "${tempnum} file(s) available"
+	done
+	numfiles=$(grep -c . "${TEMP_PREFIX}-list" || true)
+	echo "${numfiles} file(s) available on server"
+	[ "${numfiles}" -gt 0 ] || Err_Fatal "Error in processing list or no files can be found with specified tag(s) or site."
+}
+
+
+# Start the progress spinner: remember "-" as the current frame in ${_last}
+# and print it without a newline.
+progress_init() {
+	_last="-"
+	printf '%s' "-"
+}
+
+# Advance the progress spinner by one frame.
+# ${_last} cycles through - \ | / ; the new frame is printed after a
+# backspace so that it overwrites the previous one.
+progress_anim() {
+	case "${_last}" in
+		/) _last="-";;
+		-) _last=\\;;
+		\\) _last=\|;;
+		\|) _last="/";;
+	esac
+	# Pass the frame as data, not as part of the format: a format string
+	# ending in a lone backslash has unspecified results.
+	printf '\b%s' "${_last}"
+}
+
+# Finish the progress spinner: overwrite the last frame with "done" and end the line.
+progress_done() {
+	printf '\b%s\n' "done"
+}
+
+# Print the number of entries (hidden ones included, "." and ".." excluded)
+# directly inside the directory given as argument. Uses globbing, not ls.
+Count_Files() {
+	numfiles=0
+	for dircontent in "${*}/"* "${*}/".*; do
+		# The dot glob can yield the directory itself and its parent.
+		case "${dircontent}" in
+			"${*}/."|"${*}/..") continue;;
+		esac
+		# A pattern that matched nothing stays literal and does not exist.
+		[ -e "${dircontent}" ] || continue
+		numfiles=$((numfiles + 1))
+	done
+	echo $((numfiles))
+}
+
+# Verify that every external program this script runs can be found in PATH.
+# Exits through Err_Fatal, naming the first missing program.
+Check_Tools() {
+	# Only programs the script really calls are listed, so a missing tool is
+	# reported here rather than halfway through a run, and tools that are
+	# never used cannot block start-up.
+	for cmd in basename cat chmod cp cut date grep mkdir mv openssl rm rmdir sed tail touch wget xsltproc
+	do
+		command -v "${cmd}" >/dev/null 2>&1 || Err_Fatal "${cmd} doesn't exist in ${PATH}"
+	done
+}
+
+# Verify that the working folders exist and are usable, creating them when
+# they are missing.
+#
+# Reads:  BASE_DIR, SITE_DIR, TARGET_DIR, TEMP_PREFIX.
+# Effects:
+#   - exits through Err_Fatal when ${BASE_DIR} is not owned by the user;
+#   - creates temp, trash, deleted and <SITE_DIR>/<TARGET_DIR> below
+#     ${BASE_DIR};
+#   - sets ISNEW=1 when the target folder contains no entries;
+#   - creates the ${TEMP_PREFIX}-* work files.
+Check_Folders(){
+	[ -O "${BASE_DIR}" ] || Err_Fatal "You don't own ${BASE_DIR}. Please fix ${BASE_DIR} or run this script in your own directory."
+	for directory in temp trash deleted "${SITE_DIR}/${TARGET_DIR}"; do
+		if [ ! -d "${BASE_DIR}/${directory}" ]; then
+			mkdir -p "${BASE_DIR}/${directory}" || Err_Impossible
+		fi
+		if [ ! -O "${BASE_DIR}/${directory}" ]; then
+			echo "You don't own the ${BASE_DIR}/${directory}, applying globally writeable permission on it"
+			# NOTE(review): this makes the whole tree world-writable; confirm
+			# that this is really wanted.
+			chmod -R u=rwX,g=rwX,o=rwX "${BASE_DIR}/${directory}" || Err_Impossible
+		fi
+	done
+	if [ "$(Count_Files "${BASE_DIR}/${SITE_DIR}/${TARGET_DIR}")" -eq 0 ]; then
+		ISNEW=1
+	fi
+	for i in error ok list newlist templist; do
+		# The handler is Err_Fatal; "Fatal_Err" does not exist, so a failed
+		# touch used to end in "command not found" instead of this message.
+		touch "${TEMP_PREFIX}-${i}" || Err_Fatal "Error creating ${TEMP_PREFIX}-${i}. This shouldn't happen"
+	done
+}
+
+# Move everything that does not belong in the target folder to a trash folder.
+#
+# An entry of ${BASE_DIR}/${SITE_DIR}/${TARGET_DIR} is trash when it is a
+# directory, when its name is not an md5 (see is_not_md5), or when its name
+# does not occur in ${TEMP_PREFIX}-list. Trash is moved to
+# ${BASE_DIR}/trash/<SITE_DIR>-<TARGET_DIR>-<UTC date and time>, and the list
+# of moved names is printed at the end. The trash folder is removed again
+# when nothing was moved into it.
+#
+# Sets: trash_dir, trashes, has_trash, trash.
+# Exits through Err_Impossible when a folder cannot be prepared or a file
+# cannot be moved. Returns 0 otherwise.
+Cleanup_Repository() {
+	printf "Cleaning up repository folder... "
+	progress_init
+	# Build the name outside the date format so that a "%" in the site or tag
+	# name is not interpreted by date. The old value of trash_dir must not be
+	# part of the path: it is unset on entry, which is fatal under set -u.
+	trash_dir="${BASE_DIR}/trash/${SITE_DIR}-${TARGET_DIR}-$(date -u "+%Y%m%d-%H.%M")"
+	trashes="These files have been moved to ${trash_dir}:"
+	has_trash=
+	if [ ! -d "${trash_dir}" ]; then
+		mkdir -p "${trash_dir}" || Err_Impossible
+	elif [ ! -O "${trash_dir}" ]; then
+		chmod -R u=rwX,g=rwX,o=rwX "${trash_dir}" || Err_Impossible
+	fi
+	for trash in "${BASE_DIR}/${SITE_DIR}/${TARGET_DIR}/"*
+	do
+		if [ -e "${trash}" ]; then
+			# -F: the file name is a fixed string, not a regular expression.
+			if [ -d "${trash}" ] || [ -n "$(is_not_md5 "${trash}")" ] || ! grep -q -F -- "$(get_basename "${trash}")" "${TEMP_PREFIX}-list"; then
+				has_trash=1
+				mv -f -- "${trash}" "${trash_dir}" || Err_Impossible
+				trashes="${trashes}
+  $(get_basename "${trash}")"
+			fi
+		fi
+		progress_anim
+	done
+	# rmdir is expected to fail when the trash folder is in use; that must not
+	# trip set -e.
+	rmdir "${trash_dir}" 2>/dev/null || true
+	progress_done
+	# Written as an if so that the function returns 0 when there was no trash.
+	if [ -n "${has_trash}" ]; then
+		echo "${trashes}"
+	fi
+}
+
+# Verify the local files and work out which files still have to be fetched.
+#
+# When ISNEW is empty:
+#   1. runs Cleanup_Repository unless NOCLEAN is set;
+#   2. checks every entry of ${BASE_DIR}/${SITE_DIR}/${TARGET_DIR}: entries
+#      whose name is not an md5 and directories are only reported; files
+#      whose md5 equals their name are recorded in ${TEMP_PREFIX}-ok; files
+#      with a different md5 are deleted and recorded in ${TEMP_PREFIX}-error;
+#   3. writes to ${TEMP_PREFIX}-newlist every line of ${TEMP_PREFIX}-list
+#      that does not contain the name of a file recorded as ok.
+# When ISNEW is set (empty folder or quick mode) the whole list is copied to
+# ${TEMP_PREFIX}-newlist without any check.
+#
+# Exits through Err_Fatal when a bad file cannot be removed and through
+# Err_Impossible when the new-file list cannot be produced.
+Check_Files() {
+	if [ ! -n "${ISNEW}" ]; then
+		# NOCLEAN is only assigned when -n was given; the default expansion
+		# keeps set -u from aborting when it was not.
+		if [ -z "${NOCLEAN:-}" ]; then
+			Cleanup_Repository
+		fi
+		printf "Checking for errors... "
+		progress_init
+		files_error="These files do not match its md5:"
+		files_notdanbooru="These files are not checked:"
+		has_err_filename=
+		has_err_md5=
+		> "${TEMP_PREFIX}-error"
+		> "${TEMP_PREFIX}-ok"
+		for file in "${BASE_DIR}/${SITE_DIR}/${TARGET_DIR}/"*
+		do
+			if [ -e "${file}" ]; then
+				if [ -n "$(is_not_md5 "${file}")" ] || [ -d "${file}" ]; then
+					files_notdanbooru="${files_notdanbooru}
+  $(get_basename "${file}")"
+					has_err_filename=1
+				else
+					if [ "$(get_md5 "${file}")" = "$(get_filename "${file}")" ]; then
+						get_basename "${file}" >> "${TEMP_PREFIX}-ok"
+					else
+						rm -- "${file}" || Err_Fatal "Error removing ${file}"
+						get_basename "${file}" >> "${TEMP_PREFIX}-error"
+						files_error="${files_error}
+  $(get_basename "${file}")"
+						has_err_md5=1
+					fi
+				fi
+			fi
+			progress_anim
+		done
+		progress_done
+		if [ ! -n "${has_err_md5}" ] && [ ! -n "${has_err_filename}" ]; then
+			echo "All files OK"
+		else
+			if [ -n "${has_err_md5}" ]; then
+				echo "${files_error}"
+				echo "$(grep -c . "${TEMP_PREFIX}-error") file(s) removed"
+			fi
+			if [ -n "${has_err_filename}" ]; then
+				echo "${files_notdanbooru}"
+			fi
+		fi
+		echo "$(grep -c . "${TEMP_PREFIX}-ok") file(s) available locally"
+
+		printf "Generating list of new files... "
+		progress_init
+		if [ -s "${TEMP_PREFIX}-ok" ]; then
+			# One pass with all local names as fixed strings. grep exits 1 when
+			# every line is filtered out (nothing left to download); only a
+			# status above 1 is a real error.
+			grep -v -F -f "${TEMP_PREFIX}-ok" "${TEMP_PREFIX}-list" > "${TEMP_PREFIX}-newlist" || [ $? -eq 1 ] || Err_Impossible
+		else
+			# No valid local file: everything on the list is new. Without this
+			# the new-file list would keep whatever an earlier run left in it.
+			cp -f "${TEMP_PREFIX}-list" "${TEMP_PREFIX}-newlist" || Err_Impossible
+		fi
+		progress_done
+		echo "$(grep -c . "${TEMP_PREFIX}-newlist") file(s) to be downloaded"
+	else
+		if [ -n "${ISQUICK}" ]; then
+			echo "Quick mode selected. Skipping check"
+		else
+			echo "Empty local repository"
+		fi
+		cat "${TEMP_PREFIX}-list" > "${TEMP_PREFIX}-newlist"
+	fi
+}
+
+# Download every URL listed in ${TEMP_PREFIX}-newlist into
+# ${BASE_DIR}/${SITE_DIR}/${TARGET_DIR}. wget writes its log to
+# ${TEMP_PREFIX}.log. Prints "No new file" when the list has no entries.
+Fetch_Images() {
+	if ! [ "$(grep -c . "${TEMP_PREFIX}-newlist")" -eq 0 ]; then
+		printf "Downloading files... "
+		cd "${BASE_DIR}/${SITE_DIR}/${TARGET_DIR}"
+		wget -e continue=on \
+			-i "${TEMP_PREFIX}-newlist" \
+			-o "${TEMP_PREFIX}.log" \
+			--referer="http://${SITE}/post" \
+			--user-agent="${useragent}"
+	else
+		echo "No new file"
+	fi
+}
+
+# Parse the command line and initialise the run-time variables.
+#
+# Arguments: COMMAND [-n] [-p PASSWORD] [-s SITE_URL] [-u USERNAME] TAGS...
+# Sets:      JOB, SITE, TAGS, NOCLEAN, ISQUICK, ISNEW, BASE_DIR, TARGET_DIR,
+#            SITE_DIR, TEMP_PREFIX; LOGIN_USER / LOGIN_PASS when given, and
+#            _use_login=1 when both were given. PATH is extended when
+#            ADDITIONAL_PATH is not empty.
+# Exits through Err_Help on a missing or unknown command or an invalid
+# option, and through Err_Fatal when no tag is left after the options.
+init()
+{
+	# path initialization: prepend the additional path when one is specified
+	if [ -n "${ADDITIONAL_PATH}" ]; then
+		PATH="${ADDITIONAL_PATH}:${PATH}"
+		export PATH
+	fi
+
+	# misc variables. NOCLEAN must exist even without -n, because the script
+	# runs under set -u and reads it later.
+	ISQUICK=
+	ISNEW=
+	NOCLEAN=
+
+	# minimum number of arguments: 2 (command and tag)
+	if [ $# -lt 2 ]; then
+		Err_Help
+	fi
+	case "$1" in
+		check|fetch|quickfetch)
+			echo "Starting..."
+			JOB="$1"
+		;;
+		*)
+			Err_Help
+		;;
+	esac
+	shift
+	SITE=
+	TAGS=
+	has_pass=0
+	has_user=0
+	OPTIND=1
+	while getopts "s:nu:p:" opt
+	do
+		case "$opt" in
+			s) SITE="$OPTARG";;
+			n) NOCLEAN=1;;
+			p)
+				# only the SHA-1 hex digest of the password is kept
+				LOGIN_PASS=$(printf "%s" "$OPTARG" | openssl dgst -sha1 | sed -e 's/.*\([[:xdigit:]]\{40\}\).*/\1/')
+				has_pass=1
+			;;
+			u)
+				LOGIN_USER="$OPTARG"
+				has_user=1
+			;;
+			# unknown option or missing option argument
+			*) Err_Help;;
+		esac
+	done
+	# After the final getopts call OPTIND already points past a "--"
+	# separator, so no separate check that would touch an unset $1 is needed.
+	shift $((OPTIND - 1))
+	TAGS="$*"
+	[ -n "${SITE}" ] || SITE=${DEFAULT_SITE}
+	[ -n "${TAGS}" ] || Err_Fatal "No tag specified"
+	# Get base folder - default, current folder or fallback to ${HOME}
+	[ -n "${BASE_DIR}" ] || BASE_DIR=${PWD}
+	[ -n "${BASE_DIR}" ] || BASE_DIR=${HOME}
+	# force an absolute path
+	case "${BASE_DIR}" in
+		/*) ;;
+		*) BASE_DIR="/${BASE_DIR}";;
+	esac
+	# login is used only when both user name and password were given
+	if [ "${has_pass}" -eq 1 ] && [ "${has_user}" -eq 1 ]; then
+		_use_login=1
+	fi
+
+	echo "Tags: ${TAGS}"
+	# slash is not wanted for folder name
+	TARGET_DIR=$(echo "${TAGS}" | sed -e 's/\//_/g')
+	SITE_DIR=$(echo "${SITE}" | sed -e 's/\/$//g;s/\//_/g')
+	TEMP_PREFIX="${BASE_DIR}/temp/${SITE_DIR}-${TARGET_DIR}"
+}
+
+# Set the script-wide defaults that must exist before anything else runs.
+init_globals()
+{
+	# login flag: 0 = anonymous; init switches it to 1 when both a user name
+	# and a password were given
+	_use_login=0
+	# version of this script, shown in the welcome message
+	_version="1.0-rc3"
+}
+
+# Program flow: set defaults, print the banner, parse the arguments, check
+# tools and folders, then run the steps that belong to ${JOB}.
+#   check      - build the URL list and check the local files
+#   fetch      - as check, then download the new files
+#   quickfetch - as fetch, but with ISNEW and ISQUICK set so that the local
+#                file check is skipped
+main()
+{
+	# removing GNU-ism as much as possible
+	# (assigned only, not exported)
+	POSIXLY_CORRECT=1
+	init_globals
+	msg_welcome
+	init "$@"
+	Check_Tools
+	Check_Folders
+
+	# quick mode only differs from fetch by these two flags
+	if [ "${JOB}" = quickfetch ]; then
+		ISNEW=1
+		ISQUICK=1
+	fi
+
+	# let's do the job!
+	case "${JOB}" in
+		check)
+			Generate_Link
+			Check_Files
+		;;
+		fetch|quickfetch)
+			Generate_Link
+			Check_Files
+			Fetch_Images
+		;;
+	esac
+}
+
+# Entry point: run main with the script's own command-line arguments.
+main "$@"
+