view moefetch.sh @ 181:d3b7927bdb2b

restructuring and add check if the xml is processed properly
author edhoprima@gmail.com <edhoprima@gmail.com>
date Sun, 28 Jun 2009 05:12:41 +0000
parents 8e6555aa8631
children d92dfe857047
line wrap: on
line source

#!/bin/sh

# Copyright (c) 2009, edogawaconan <me@myconan.net>
# 
# Permission to use, copy, modify, and/or distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
# 
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
#
# Lots of bugs here. Use with care
# USE WITH CARE
#
# what it does: fetch every picture that has the specified TAGS.
# requirement: wget, libxslt, md5sum (or md5)

# Extra directories (colon separated) that get prepended to PATH before any
# tool is looked up. Use it when cut, sed, wc, md5(sum), wget, xsltproc or grep
# live outside the default PATH.
ADDITIONAL_PATH=

# Custom md5 command, arguments included. Its output must look like
# <32 digit md5><space(s)><filename>.
# Leave empty to let the script pick "md5sum" (Linux, Solaris) or "md5 -r" (*BSD).
MD5=

# Default server address, used when no -s option is given.
# Danbooru-powered sites only; anything else is not supported.
DEFAULT_SITE="moe.imouto.org"

# Base directory. It must be owned by the user running the script and be
# writeable; the script refuses to run otherwise.
# Downloads are stored in ${BASE_DIR}/<SITE>/<TAGS>; the working folders
# temp, trash and deleted are created directly under ${BASE_DIR}.
# Absolute path only.
# Leave empty to use the folder the script is started from.
BASE_DIR=""

# nothing below this line is meant to be modified by the user

# startup banner; also the place where the program version is recorded
# Writes: MOEFETCHVERSION
Msg_Welcome() {
	MOEFETCHVERSION="0.1-beta2"
	# two banner lines followed by one blank separator line
	printf '%s\n' \
		"moefetch ${MOEFETCHVERSION}" \
		"Copyright (c) 2009 edogawaconan <me@myconan.net>" \
		""
}

# fatal error handler
# $1: message describing what went wrong
# Prints "Fatal error: <message>" on stderr and terminates the script with status 1.
Err_Fatal() {
	# diagnostics go to stderr so they never mix with redirected or piped stdout
	echo "Fatal error: ${1}" >&2
	exit 1
}

# help message
# Prints the usage text on stdout, then terminates the script with status 0.
# Reads: DEFAULT_SITE
Err_Help() {
	# one argument per output line; empty arguments give the blank separator lines
	printf '%s\n' \
		"moefetch.sh COMMAND [-s SITE_URL] TAGS" \
		"" \
		"COMMAND: " \
		"(quick)fetch: do a complete update. Add prefix quick to skip file checking" \
		"check: get list of new files, clean up local folder and print total new files" \
		"" \
		"-s SITE_URL: Specify URL of the Danbooru powered site you want to leech from. Default is ${DEFAULT_SITE}" \
		"" \
		"TAGS: Tags you want to download. Separated by spaces. Tag name follows standard Danbooru tagging scheme"
	# the usage text closes with a line that holds a single tab
	printf '\t\n'
	exit 0
}

# generate link by transforming xml
# Reads:  BASE_DIR, SITE, TAGS, SITE_DIR, TARGET_DIR
# Writes: ${BASE_DIR}/temp/${SITE_DIR}-${TARGET_DIR}-xml   post index as returned by the site
#         ${BASE_DIR}/temp/${SITE_DIR}-${TARGET_DIR}-list  one image URL per line
#         NUMFILES                                         number of lines in that list
# Leaves the shell in ${BASE_DIR}/temp.
# Aborts through Err_Fatal when the folder cannot be entered or the list ends up empty.
Generate_Link() {
	cd "${BASE_DIR}/temp" || Err_Fatal "Cannot enter ${BASE_DIR}/temp"
	echo
	echo "Fetching xml file"
	wget "http://${SITE}/post/index.xml?tags=${TAGS}&offset=0&limit=100000" -O "${SITE_DIR}-${TARGET_DIR}-xml" -e continue=off
	echo "Processing XML file..."
	# xslt evilry:
	# - the stylesheet prints the file_url attribute of every <post>
	# - sed cuts each URL down to <prefix>/<md5>.<extension>, dropping whatever sits
	#   between the md5 and the extension
	# - grep keeps only the lines that start with http
	xsltproc - "${SITE_DIR}-${TARGET_DIR}-xml" <<EOF | sed 's/.*\(http.*\)\(\/[a-f0-9]\{32\}\).*\.\([jp][pn]g\)/\1\2.\3/g' | grep ^http > "${SITE_DIR}-${TARGET_DIR}-list"
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
<xsl:output method="xml" indent="yes"/>
<xsl:template match="post">
<xsl:value-of select="@file_url" />
</xsl:template>
</xsl:stylesheet>
EOF
	# count in two steps: no escaped quotes inside nested backticks (they ended up
	# as literal quote characters in the file name), and the unquoted echo strips
	# the padding some wc implementations put around the number
	NUMFILES=`wc -l < "${SITE_DIR}-${TARGET_DIR}-list"`
	NUMFILES=`echo ${NUMFILES}`
	[ "${NUMFILES}" -gt 0 ] || Err_Fatal "Error in processing list or no files can be found with specified tag(s) or site"
	echo "${NUMFILES} file(s) available on server"
	#output file: ${SITE_DIR}-${TARGET_DIR}-list
}

# check tools availability
# Reads:  MD5 (optional user override), PATH
# Writes: MD5 (platform default when it was left empty),
#         MD5_COMMAND (first word of MD5, i.e. the program itself)
# Aborts through Err_Fatal when no md5 tool is known for the platform, when a
# required program cannot be found, or when grep fails the -f test below.
Check_Tools() {
	# verify all programs required do indeed exist
	#MD5
	if [ -z "${MD5}" ]; then
		case `uname` in
			*BSD) MD5="md5 -r";;
			Linux|SunOS) MD5="md5sum";;
			*) Err_Fatal "No known md5 tool for this platform. Please specify manually";;
		esac
	fi
	# MD5 may carry arguments; only its first word is a program to look up
	MD5_COMMAND=`echo ${MD5} | cut -d' ' -f1`
	# basic tools
	COMMANDS="cut sed wc wget xsltproc xargs rm mkdir chown chmod comm grep ${MD5_COMMAND}"
	for COMMAND in ${COMMANDS}
	do
		COMMAND_CHECK=`command -v "${COMMAND}"`
		[ "${COMMAND_CHECK}" ] || Err_Fatal "${COMMAND} doesn't exist in ${PATH}"
	done

	# grep checking
	# originally created for workaround on solaris
	#if [ `uname` = "SunOS" ]; then
	# the script relies on "grep -f <pattern file>"; try it on two scratch files
	# created in the current folder
	FAIL=""
	echo "blah" > superrandomtestfile
	echo "blah" > superrandomtestfile.2
	grep -f superrandomtestfile.2 superrandomtestfile > /dev/null 2>&1 || FAIL=1
	rm -f superrandomtestfile superrandomtestfile.2
	# plain if instead of "[ ] && ..." so a passing check leaves status 0 behind
	if [ "${FAIL}" ]; then
		Err_Fatal "Your grep is not compatible. Please install or set path of correct grep"
	fi
}

# verify required folders exist and writeable
# Reads:  BASE_DIR, SITE_DIR, TARGET_DIR
# Writes: ISNEW (set to 1 when the download folder holds no files)
# Creates temp, trash, deleted and ${SITE_DIR}/${TARGET_DIR} under ${BASE_DIR}
# when missing, plus empty error/ok/list/newlist files in ${BASE_DIR}/temp.
# Leaves the shell in ${BASE_DIR}/temp.
# Aborts through Err_Fatal when ${BASE_DIR} is not owned by the caller or when
# a folder or file cannot be prepared.
Check_Folders(){
	[ -O "${BASE_DIR}" ] || Err_Fatal "You don't own ${BASE_DIR}. Please fix ${BASE_DIR}."
	for FOLDER in temp trash deleted "${SITE_DIR}/${TARGET_DIR}"; do
		if [ ! -d "${BASE_DIR}/${FOLDER}" ]; then
			# -p: on the first run for a site, the site folder above the tag folder
			# does not exist yet and has to be created along the way
			mkdir -p "${BASE_DIR}/${FOLDER}" || Err_Fatal "${FOLDER} folder creation failed"
		fi
		if [ ! -O "${BASE_DIR}/${FOLDER}" ]; then
			echo "You don't own the ${BASE_DIR}/${FOLDER}, applying globally writeable permission on it"
			chmod -R u=rwX,g=rwX,o=rwX "${BASE_DIR}/${FOLDER}" || Err_Fatal "Error changing ownership. This shouldn't happen"
		fi
	done
	# an empty listing means nothing has been downloaded for these tags so far
	if [ -z "`ls "${BASE_DIR}/${SITE_DIR}/${TARGET_DIR}"`" ]; then
		ISNEW=1
	fi
	# let's move to workdir
	cd "${BASE_DIR}/temp" || Err_Fatal "Cannot enter ${BASE_DIR}/temp"
	# later steps read these files unconditionally, so make sure they exist
	for i in error ok list newlist; do
		touch "${SITE_DIR}-${TARGET_DIR}-${i}" || Err_Fatal "Error creating ${TARGET_DIR}-${i}. This shouldn't happen"
	done
}

# check files correctness
# Reads:  ISNEW, ISQUICK, BASE_DIR, SITE_DIR, TARGET_DIR, MD5
# Writes: TRASH_DIR, TOTAL_ERROR and these files in ${BASE_DIR}/temp
#         (all prefixed with ${SITE_DIR}-${TARGET_DIR}):
#           -error    local files whose md5 does not match their name
#           -ok       local files left after the cleanup
#           -newlist  lines of -list that match no name in -ok
# With ISNEW empty, the download folder is cleaned first: sub-folders and files
# whose name is not <32 hex digits><anything> are moved to a time-stamped folder
# under ${BASE_DIR}/trash, files failing the md5 check are deleted.
# With ISNEW set, nothing is verified and -newlist becomes a copy of -list.
# Leaves the shell in ${BASE_DIR}/temp.
Check_Files() {
	if [ ! "${ISNEW}" ]; then
		echo "Checking for errors..."
		# THE FILES

		# current dir: ${BASE_DIR}/${SITE_DIR}/${TARGET_DIR}
		cd "${BASE_DIR}/${SITE_DIR}/${TARGET_DIR}" || Err_Fatal "Cannot enter ${BASE_DIR}/${SITE_DIR}/${TARGET_DIR}"
		TRASH_DIR="${SITE_DIR}-${TARGET_DIR}-`date -u +%Y%m%d-%H.%M`"
		mkdir -p "${BASE_DIR}/trash/${TRASH_DIR}"
		for TRASH in *
		do
			if [ -d "${TRASH}" ]; then
				mv -f "${TRASH}" "${BASE_DIR}/trash/${TRASH_DIR}" || Err_Fatal "Error deleting files"
				echo "Moved ${TRASH} to ${BASE_DIR}/trash/${TRASH_DIR}"
			else
				# remove the md5 and everything after it from the name; whatever is
				# left over means the file is not named <md5>.<ext>. The leftover is
				# kept in its own variable so the file is moved under its real name.
				NOT_MD5_PART="`printf '%s\n' "${TRASH}" | sed -e 's/\([0-9a-f]\{32\}.*\)//g' | grep -v ^$`"
				if [ "${NOT_MD5_PART}" ]; then
					mv -f "${TRASH}" "${BASE_DIR}/trash/${TRASH_DIR}" || Err_Fatal "Error deleting files"
					echo "Moved ${TRASH} to ${BASE_DIR}/trash/${TRASH_DIR}"
				fi
			fi
		done
		# nothing was trashed: do not leave an empty folder behind
		[ "`ls "${BASE_DIR}/trash/${TRASH_DIR}"`" ] || rmdir "${BASE_DIR}/trash/${TRASH_DIR}"
		printf "" > "${BASE_DIR}/temp/${SITE_DIR}-${TARGET_DIR}-error"
		for FILE in *
		do
			# an unmatched glob is handed over as a literal "*"; skip it
			[ -f "${FILE}" ] || continue
			# the part of the name before the first dot has to be the file's md5
			if [ "`${MD5} "${FILE}" | cut -d ' ' -f1 -`" != "`echo "${FILE}" | cut -d '.' -f1`" ]
			then
				echo
				echo "${FILE}" >> "${BASE_DIR}/temp/${SITE_DIR}-${TARGET_DIR}-error"
				echo "Error: ${FILE}"
			fi
			printf "."
		done
		echo
		TOTAL_ERROR=`echo \`wc -l < "${BASE_DIR}/temp/${SITE_DIR}-${TARGET_DIR}-error"\``
		echo "${TOTAL_ERROR} file(s) error"
		echo "Removing error files"
		if [ "${TOTAL_ERROR}" -eq 0 ]; then
			echo "No error file. 0 file removed"
		else
			# one name per line, read verbatim so blanks inside a name stay intact
			while IFS= read -r BAD_FILE
			do
				rm -f "${BAD_FILE}"
			done < "${BASE_DIR}/temp/${SITE_DIR}-${TARGET_DIR}-error"
			echo "${TOTAL_ERROR} file(s) removed"
		fi
		echo "`echo \`ls | wc -l\`` file(s) available locally"

		# current dir: ${BASE_DIR}/temp
		cd "${BASE_DIR}/temp" || Err_Fatal "Cannot enter ${BASE_DIR}/temp"

		echo "Generating list of new files..."
		# THE FILES
		# -ok: names present locally and not in -error
		ls "${BASE_DIR}/${SITE_DIR}/${TARGET_DIR}" | comm -1 -3 "${SITE_DIR}-${TARGET_DIR}-error" - > "${SITE_DIR}-${TARGET_DIR}-ok"
		# -newlist: URLs from -list that match none of the names in -ok
		cat "${SITE_DIR}-${TARGET_DIR}-list" | grep -vf "${SITE_DIR}-${TARGET_DIR}-ok" > "${SITE_DIR}-${TARGET_DIR}-newlist"
		NEW_TOTAL=`wc -l < "${SITE_DIR}-${TARGET_DIR}-newlist"`
		# the unquoted inner echo strips the padding some wc implementations add
		echo "`echo ${NEW_TOTAL}` file(s) to be downloaded"

	else
		if [ "${ISQUICK}" ]; then
			echo "quick mode selected. Skipping check"
		else
			echo "Empty local repository"
		fi
		cd "${BASE_DIR}/temp" || Err_Fatal "Cannot enter ${BASE_DIR}/temp"
		cat "${SITE_DIR}-${TARGET_DIR}-list" > "${SITE_DIR}-${TARGET_DIR}-newlist"
	fi
}

# start downloading the images
# Reads: BASE_DIR, SITE_DIR, TARGET_DIR
# Reports and stops when the newlist file in ${BASE_DIR}/temp has no lines;
# otherwise changes into the download folder and hands the list to wget.
Fetch_Images() {
	cd "${BASE_DIR}/temp"
	# guard: nothing queued for download
	if [ `echo \`wc -l < "${SITE_DIR}-${TARGET_DIR}-newlist"\`` -eq 0 ]; then
		echo "No new file"
		return
	fi
	echo "Starting wget"
	# wget runs from inside the download folder; the URL list and the log file
	# both live in ${BASE_DIR}/temp
	cd "${BASE_DIR}/${SITE_DIR}/${TARGET_DIR}"
	wget -e continue=on -bi "${BASE_DIR}/temp/${SITE_DIR}-${TARGET_DIR}-newlist" -o "${BASE_DIR}/temp/${SITE_DIR}-${TARGET_DIR}.log"
}

# initialize base variables and initial command check
# Arguments: COMMAND [-s|--site SITE_URL] TAGS...
# Reads:  ADDITIONAL_PATH, DEFAULT_SITE, BASE_DIR, PWD, HOME
# Writes: PATH (exported), ISQUICK, ISNEW, JOB, SITE, BASE_DIR, TAGS,
#         TARGET_DIR, SITE_DIR
# Calls Err_Help when the command is unknown, when no tag is given, or when
# the site option comes without a value.
Init(){
	# path initialization
	[ "${ADDITIONAL_PATH}" ] && PATH="${ADDITIONAL_PATH}:${PATH}"
	export PATH

	# misc variables
	ISQUICK=
	ISNEW=

	# a command plus at least one more word is the bare minimum
	[ $# -lt 2 ] && Err_Help
	case "$1" in
		check|fetch|quickfetch)
			echo "Starting..."
			JOB="$1"
		;;
		*)
			Err_Help
		;;
	esac
	shift
	SITE=
	case "$1" in
		-s|--site)
			# the option needs its value and at least one tag after it
			[ $# -lt 3 ] && Err_Help
			SITE="$2"
			shift 2
		;;
		*)
			SITE="${DEFAULT_SITE}"
		;;
	esac
	# Get base folder - default, current folder or fallback to ${HOME}
	[ "${BASE_DIR}" ] || BASE_DIR="${PWD}"
	[ "${BASE_DIR}" ] || BASE_DIR="${HOME}"
	# only absolute paths are supported; anything else gets anchored at /
	case "${BASE_DIR}" in
		/*) ;;
		*) BASE_DIR="/${BASE_DIR}" ;;
	esac

	# all remaining words are tags, joined by single spaces
	TAGS="$*"
	echo "Tags: ${TAGS}"
	# slash is not wanted for folder name
	TARGET_DIR="`printf '%s\n' "${TAGS}" | sed -e 's/\//_/g'`"
	# same for the site: drop one trailing slash, turn the remaining ones into _
	SITE_DIR="`printf '%s\n' "${SITE}" | sed -e 's/\/$//g;s/\//_/g'`"
}

# initialization
Msg_Welcome
Init "$@"
Check_Tools
Check_Folders


# let's do the job!
# quickfetch is fetch with the local repository treated as new, so the file
# check is skipped
if [ "${JOB}" = "quickfetch" ]; then
	ISNEW=1
	ISQUICK=1
fi
case "${JOB}" in
	check)
		Generate_Link
		Check_Files
	;;
	fetch|quickfetch)
		Generate_Link
		Check_Files
		Fetch_Images
	;;
esac