view bin/moefetch @ 364:7833673f4860

Added readme because I can.
author Edho Arief <edho@myconan.net>
date Sat, 02 Jun 2012 17:07:57 +0700
parents 391f2b64900e
children fc644e52df7c
line wrap: on
line source

#!/bin/sh

# Copyright (c) 2009-2012, edogawaconan <edho@myconan.net>
# 
# Permission to use, copy, modify, and/or distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
# 
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
#
# Lots of bugs here. Use with care
# USE WITH CARE
#
# what it does: fetch every picture that has the specified TAGS.
# requirement: wget, libxslt, openssl

# Extra directories to put in front of PATH when the tools this script calls
# (cut, sed, wc, openssl, wget, xsltproc, grep) are not on the default PATH.
# Leave empty to use PATH as it is.
ADDITIONAL_PATH=""

# Site queried when -s is not given. Only Danbooru-style sites are supported;
# pointing this at anything else is at your own risk.
DEFAULT_SITE="yande.re"

# Base directory. Downloads end up in ${BASE_DIR}/<SITE>/<TAGS>.
# Must be an absolute path and must be writeable by you; ownership of the
# files already in it is not verified.
# Leave empty to use the directory the script is started from.
BASE_DIR=""

# ---- nothing below this line is meant to be changed by users ----

# -e: abort as soon as a command fails
# -u: abort when an unset variable is expanded
set -eu

# User-Agent string handed to every wget invocation.
useragent="Mozilla/5.0 (Windows NT 6.1; WOW64; rv:10.0) Gecko/20100101 Firefox/10.0"

# Print the startup banner: program name plus ${_version}, the copyright
# line, and one empty line.
msg_welcome()
{
	echo "moefetch ${_version}"
	echo "Copyright (c) 2009-2012 edogawaconan <edho@myconan.net>"
	echo ""
}

# Make a path safe to hand to a command as an argument.
# Usage: cmd "$(safe_path "${filename}")"
#
# Arguments: the path (all arguments are joined with "$*").
# Output:    the path unchanged when it already starts with "." or "/",
#            otherwise the path prefixed with "./" so that a leading "-"
#            can never be mistaken for an option. No trailing newline.
#
# The first character is matched with a shell pattern instead of `cut -c 1`:
# cut works line by line, so a path containing a newline produced several
# "first characters" and an absolute path was wrongly prefixed with "./".
safe_path()
{
	case "$*" in
		.*|/*) printf "%s" "$*";;   # already anchored, leave untouched
		*)     printf "%s" "./$*";; # anything else (including empty) gets ./
	esac
}

# Print the MD5 digest of file $1.
# The file is streamed through `openssl dgst -md5`; sed then keeps only the
# 32 hexadecimal digits from the last line of openssl's output.
get_md5()
{
	cat "$(safe_path "${1}")" |
		openssl dgst -md5 |
		tail -n 1 |
		sed -e 's/.*\([[:xdigit:]]\{32\}\).*/\1/'
}

# Print the last path component of $1.
# The path goes through safe_path first so basename never sees an argument
# that begins with "-".
get_basename()
{
	set -- "$(safe_path "${1}")"
	basename "${1}"
}

# Print the file name of $1 without its directory and without its last
# extension (e.g. "/x/y/abc.jpg" -> "abc").
#
# The directory part is removed BEFORE the extension is stripped. Stripping
# first, as was done previously, cut the path at a dot inside a directory
# name: "/data/yande.re/tag/README" became "/data/yande" and the result was
# "yande" instead of "README".
# A name with no dot, or whose only dot is the leading one (".hidden"), is
# printed unchanged.
get_filename()
{
	set -- "$(get_basename "${1}")"
	case "${1}" in
		?*.*) printf "%s\n" "${1%.*}";; # has an extension: drop the last one
		*)    printf "%s\n" "${1}";;
	esac
}

# Prepare the tag list for use inside the query URL.
# All arguments are joined with spaces, a single space is appended, and
# every "&" becomes %26 and every "=" becomes %3D. Nothing else is escaped.
get_cleantags()
{
	printf "%s " "$*" |
		sed -e 's/\&/%26/g;s/=/%3D/g'
}

# Print nothing when the file name of $1 (directory and extension removed by
# get_filename) is exactly 32 lowercase hexadecimal digits; otherwise print
# the file name. Callers only test whether the output is empty.
#
# The pattern is anchored on both ends. The previous unanchored, global
# substitution removed every 32-digit run, so a 64-digit name also produced
# empty output and was treated as an md5 name.
is_not_md5()
{
	get_filename "$1" | sed -e 's/^[0-9a-f]\{32\}$//'
}


# Fatal error handler.
# Prints an empty line followed by "Fatal error: <message>" on standard
# output, then terminates the script with exit status 1.
# $1: the message to show.
Err_Fatal()
{
	echo ""
	echo "Fatal error: ${1}"
	exit 1
}

# Handler for failures that should not be able to happen (mkdir, chmod, mv
# or cp failing inside the working directories).
# Prints a fixed message on standard output and exits with status 1.
Err_Impossible()
{
	printf "%s\n" \
		"" \
		"Impossible error. Or you modified content of the working directories when the script is running." \
		"Please report to moefetch.googlecode.com if you see this message (complete with entire run log)"
	exit 1
}

# Print the usage text on standard output and exit with status 2.
# Reads DEFAULT_SITE to show the default value of -s.
Err_Help()
{
	printf "%s\n" \
		"moefetch.sh COMMAND [-n] [-p PASSWORD] [-s SITE_URL] [-u USERNAME] TAGS" \
		"" \
		"COMMAND: " \
		"	(quick)fetch:" \
		"		Do a complete update. Add prefix quick to skip file checking" \
		"	check:" \
		"		Get list of new files, clean up local folder and print total new files" \
		"" \
		"OPTIONS:" \
		"	-n:" \
		"		Skip checking repository directory." \
		"	-p PASSWORD:" \
		"		Specifies password for login." \
		"	-s SITE_URL: " \
		"		Specify URL of the Danbooru powered site you want to leech from. Default is ${DEFAULT_SITE}." \
		"	-u USERNAME:" \
		"		Specifies username for login." \
		"	TAGS: " \
		"		Tags you want to download. Separated by spaces. Tag name follows standard Danbooru tagging scheme."
	exit 2
}

# Build the list of downloadable file URLs for ${TAGS} on ${SITE}.
#
# Reads:  SITE, TAGS, TEMP_PREFIX, useragent, _use_login and, when
#         _use_login is 1, LOGIN_USER and LOGIN_PASS.
# Writes: ${TEMP_PREFIX}-xml      (last catalog page downloaded)
#         ${TEMP_PREFIX}-templist (URLs found on the last page)
#         ${TEMP_PREFIX}-list     (URLs of all pages, one per line)
#         numfiles                (number of URLs in the list)
# Exits through Err_Fatal when a catalog page cannot be downloaded or when
# the final list is empty.
#
# Pages of up to 1000 posts are requested until a page yields fewer than
# 1000 URLs.
#
# grep returns 1 when nothing matches. Under `set -e` that used to end the
# script silently, both when a search had no results and when the last page
# was empty, so the error message below was never reached. Status 1 is now
# accepted explicitly; any other grep failure is still fatal.
Generate_Link() {
	echo "
Fetching XML file"
	tempnum=1000
	iternum=1
	: > "${TEMP_PREFIX}-list"
	while [ "${tempnum}" -ge 1000 ]; do
		url="${SITE}/post/index.xml?tags=$(get_cleantags "${TAGS}")&offset=0&limit=1000&page=${iternum}"
		if [ "${_use_login}" -eq 1 ]; then
			url="${url}&login=${LOGIN_USER}&password_hash=${LOGIN_PASS}"
		fi
		wget --no-check-certificate --quiet "${url}" -O "${TEMP_PREFIX}-xml" --referer="${SITE}/post" --user-agent="${useragent}" -e continue=off || Err_Fatal "Failed download catalog file"
		printf "Processing XML file... "
		# The stylesheet prints the file_url attribute of every <post>; sed
		# rewrites each URL to <prefix>/<md5>.<extension> and grep keeps
		# only the lines that are URLs.
		xsltproc - "${TEMP_PREFIX}-xml" <<EOF | sed 's/.*\(https*.*\)\(\/[a-f0-9]\{32\}\).*\.\([^\.]*\)/\1\2.\3/g' | { grep '^http' || [ "$?" -eq 1 ]; } > "${TEMP_PREFIX}-templist"
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
<xsl:output method="xml" indent="yes"/>
<xsl:template match="post">
<xsl:value-of select="@file_url" />
</xsl:template>
</xsl:stylesheet>
EOF
		tempnum=$(grep -c . "${TEMP_PREFIX}-templist") || [ "$?" -eq 1 ]
		iternum=$((iternum + 1))
		cat "${TEMP_PREFIX}-templist" >> "${TEMP_PREFIX}-list"
		echo "${tempnum} file(s) available"
	done
	numfiles=$(grep -c . "${TEMP_PREFIX}-list") || [ "$?" -eq 1 ]
	echo "${numfiles} file(s) available on server"
	[ "${numfiles}" -gt 0 ] || Err_Fatal "Error in processing list or no files can be found with specified tag(s) or site."
}


# Start the progress spinner: remember "-" as the current frame in _last and
# print it without a newline.
progress_init()
{
	_last="-"
	printf "%s" "${_last}"
}

# Advance the progress spinner by one frame.
# The frame stored in _last cycles  -  ->  \  ->  |  ->  /  ->  -  and the
# new frame is printed after a backspace, so it overwrites the previous one.
# Any other value of _last is left as it is and printed again.
#
# The frame is passed as an argument to a constant format. It used to be
# part of the format itself; with the "\" frame the format then ended in a
# lone backslash, whose treatment by printf is not specified.
progress_anim() {
	case "${_last}" in
		/) _last="-";;
		-) _last=\\;;
		\\) _last=\|;;
		\|) _last="/";;
	esac
	printf '\b%s' "${_last}"
}

# Finish the progress spinner: overwrite the last frame with "done" and end
# the line.
progress_done()
{
	printf "\bdone\n"
}

# Print the number of entries (files, directories, dot-files) directly
# inside the directory given as argument, without using ls.
# "." and ".." are never counted; a glob pattern that matched nothing is
# skipped by the existence test.
Count_Files() {
	numfiles=0
	for entry in "${*}/"* "${*}/".*; do
		case "${entry}" in
			"${*}/."|"${*}/..") continue;;
		esac
		[ -e "${entry}" ] || continue
		numfiles=$((numfiles + 1))
	done
	echo "${numfiles}"
}

# Verify that every external program the script relies on can be found.
# Exits through Err_Fatal, naming the first missing program and the PATH
# that was searched.
#
# The list now matches what the script really runs. chmod, cat, tail,
# basename, touch, mv, cp and rmdir are used but were not checked, while
# chown was checked although it is never called. comm, wc and xargs are
# kept from the earlier list.
Check_Tools() {
	commands="basename cat chmod comm cp cut date grep mkdir mv openssl rm rmdir sed tail touch wc wget xargs xsltproc"
	for cmd in ${commands}
	do
		command -v "${cmd}" >/dev/null 2>&1 || Err_Fatal "${cmd} doesn't exist in ${PATH}"
	done
}

# Verify that the working directories exist and are usable, creating them
# when needed, and create the temporary list files.
#
# Reads:  BASE_DIR, SITE_DIR, TARGET_DIR, TEMP_PREFIX
# Writes: ISNEW=1 when the download directory contains no entries.
# Exits through Err_Fatal when BASE_DIR is not owned by the caller or a
# temporary file cannot be created, and through Err_Impossible when mkdir or
# chmod fails.
#
# The failure branch of touch now calls Err_Fatal. It used to call
# "Fatal_Err", which does not exist, so the error was never reported
# properly.
Check_Folders(){
	[ -O "${BASE_DIR}" ] || Err_Fatal "You don't own ${BASE_DIR}. Please fix ${BASE_DIR} or run this script in your own directory."
	for directory in temp trash deleted "${SITE_DIR}/${TARGET_DIR}"; do
		if [ ! -d "${BASE_DIR}/${directory}" ]; then
			mkdir -p "${BASE_DIR}/${directory}" || Err_Impossible
		fi
		if [ ! -O "${BASE_DIR}/${directory}" ]; then
			# NOTE(review): this opens the directory tree to every user
			echo "You don't own the ${BASE_DIR}/${directory}, applying globally writeable permission on it"
			chmod -R u=rwX,g=rwX,o=rwX "${BASE_DIR}/${directory}" || Err_Impossible
		fi
	done
	# an empty download directory means nothing has to be verified later
	if [ "$(Count_Files "${BASE_DIR}/${SITE_DIR}/${TARGET_DIR}")" -eq 0 ]; then
		ISNEW=1
	fi
	for i in error ok list newlist templist; do
		touch "${TEMP_PREFIX}-${i}" || Err_Fatal "Error creating ${TEMP_PREFIX}-${i}. This shouldn't happen"
	done
}

# Move everything that does not belong in the download directory into a
# time-stamped trash directory.
#
# An entry of ${BASE_DIR}/${SITE_DIR}/${TARGET_DIR} is trash when it is a
# directory, when its name is not an md5 name (is_not_md5 prints something),
# or when its name does not occur in ${TEMP_PREFIX}-list.
#
# Reads:  BASE_DIR, SITE_DIR, TARGET_DIR, TEMP_PREFIX
# Writes: trash_dir, trashes, has_trash, trash, trash_name
# Output: progress spinner, then the list of moved files (if any).
# Exits through Err_Impossible when mkdir, chmod or mv fails.
#
# Fixes compared with the previous version:
#  - trash_dir was expanded inside its own first assignment, which is a
#    fatal "unset variable" error under `set -u`;
#  - the chmod for a trash directory not owned by the caller was applied to
#    the stale ${directory} instead of the trash directory;
#  - rmdir failing on a non-empty trash directory, or the final test being
#    false, made the function return non-zero and `set -e` ended the script;
#  - the tags are no longer part of the date format string, so a "%" in a
#    tag is not interpreted by date;
#  - the file name is looked up as a fixed string, not as a regular
#    expression.
Cleanup_Repository() {
	# current dir: ${BASE_DIR}/${SITE_DIR}/${TARGET_DIR}
	printf "Cleaning up repository folder... "
	progress_init
	trash_dir="${BASE_DIR}/trash/${SITE_DIR}-${TARGET_DIR}-$(date -u "+%Y%m%d-%H.%M")"
	trashes="These files have been moved to ${trash_dir}:"
	has_trash=
	if [ ! -d "${trash_dir}" ]; then
		mkdir -p "${trash_dir}" || Err_Impossible
	elif [ ! -O "${trash_dir}" ]; then
		chmod -R u=rwX,g=rwX,o=rwX "${trash_dir}" || Err_Impossible
	fi
	for trash in "${BASE_DIR}/${SITE_DIR}/${TARGET_DIR}/"*
	do
		if [ -e "${trash}" ]; then
			trash_name=$(get_basename "${trash}")
			if [ -d "${trash}" ] || [ -n "$(is_not_md5 "${trash}")" ] || ! grep -q -F -- "${trash_name}" "${TEMP_PREFIX}-list"; then
				has_trash=1
				mv -f -- "${trash}" "${trash_dir}" || Err_Impossible
				trashes="${trashes}
  ${trash_name}"
			fi
		fi
		progress_anim
	done
	# Remove the trash directory again when nothing was moved into it.
	# rmdir fails on a non-empty directory; that is expected and ignored.
	rmdir "${trash_dir}" 2>/dev/null || true
	progress_done
	if [ -n "${has_trash}" ]; then
		echo "${trashes}"
	fi
}

# Verify the local files and work out what still has to be downloaded.
#
# When ISNEW is empty:
#   1. unless NOCLEAN is set, Cleanup_Repository moves foreign files away;
#   2. every entry of the download directory is checked: directories and
#      names that are not md5 names are only reported; files whose md5 sum
#      equals their name are recorded in ${TEMP_PREFIX}-ok; all other files
#      are deleted and recorded in ${TEMP_PREFIX}-error;
#   3. ${TEMP_PREFIX}-newlist receives every line of ${TEMP_PREFIX}-list
#      that does not mention a file recorded in ${TEMP_PREFIX}-ok.
# When ISNEW is set, the checks are skipped and the complete list is copied
# to ${TEMP_PREFIX}-newlist.
#
# Reads:  ISNEW, ISQUICK, NOCLEAN, BASE_DIR, SITE_DIR, TARGET_DIR,
#         TEMP_PREFIX
# Exits through Err_Fatal when a corrupt file cannot be removed and through
# Err_Impossible when the list cannot be copied.
#
# Fixes compared with the previous version:
#  - NOCLEAN is unset unless -n was given; expanding it was a fatal error
#    under `set -u`. It is now read with an empty default.
#  - grep -v returns 1 when it prints nothing, so `set -e` ended the script
#    whenever every listed file was already present locally.
#  - with an empty "ok" file the filter loop never ran and a stale
#    ${TEMP_PREFIX}-newlist from an earlier run was used.
#  - the list is filtered by one fixed-string grep instead of one regular
#    expression grep and two copies per local file.
Check_Files() {
	if [ -z "${ISNEW}" ]; then
		if [ -z "${NOCLEAN:-}" ]; then
			Cleanup_Repository
		fi
		printf "Checking for errors... "
		progress_init
		files_error="These files do not match its md5:"
		files_notdanbooru="These files are not checked:"
		has_err_filename=
		has_err_md5=
		: > "${TEMP_PREFIX}-error"
		: > "${TEMP_PREFIX}-ok"
		for file in "${BASE_DIR}/${SITE_DIR}/${TARGET_DIR}/"*
		do
			if [ -e "${file}" ]; then
				file_name=$(get_basename "${file}")
				if [ -d "${file}" ] || [ -n "$(is_not_md5 "${file}")" ]; then
					files_notdanbooru="${files_notdanbooru}
  ${file_name}"
					has_err_filename=1
				elif [ "$(get_md5 "${file}")" = "$(get_filename "${file}")" ]; then
					echo "${file_name}" >> "${TEMP_PREFIX}-ok"
				else
					rm -- "${file}" || Err_Fatal "Error removing ${file}"
					echo "${file_name}" >> "${TEMP_PREFIX}-error"
					files_error="${files_error}
  ${file_name}"
					has_err_md5=1
				fi
			fi
			progress_anim
		done
		progress_done
		if [ -z "${has_err_md5}" ] && [ -z "${has_err_filename}" ]; then
			echo "All files OK"
		else
			if [ -n "${has_err_md5}" ]; then
				echo "${files_error}"
				echo "$(grep -c . "${TEMP_PREFIX}-error") file(s) removed"
			fi
			if [ -n "${has_err_filename}" ]; then
				echo "${files_notdanbooru}"
			fi
		fi
		echo "$(grep -c . "${TEMP_PREFIX}-ok") file(s) available locally"

		printf "Generating list of new files... "
		progress_init
		if [ -s "${TEMP_PREFIX}-ok" ]; then
			# drop every URL that names a verified local file; status 1
			# only means that nothing is left to download
			grep -v -F -f "${TEMP_PREFIX}-ok" "${TEMP_PREFIX}-list" > "${TEMP_PREFIX}-newlist" || [ "$?" -eq 1 ]
		else
			cp -f "${TEMP_PREFIX}-list" "${TEMP_PREFIX}-newlist" || Err_Impossible
		fi
		progress_done
		echo "$(grep -c . "${TEMP_PREFIX}-newlist") file(s) to be downloaded"
	else
		if [ -n "${ISQUICK}" ]; then
			echo "Quick mode selected. Skipping check"
		else
			echo "Empty local repository"
		fi
		cat "${TEMP_PREFIX}-list" > "${TEMP_PREFIX}-newlist"
	fi
}

# Download every URL listed in ${TEMP_PREFIX}-newlist into
# ${BASE_DIR}/${SITE_DIR}/${TARGET_DIR}; wget's log goes to
# ${TEMP_PREFIX}.log.
#
# Reads: TEMP_PREFIX, BASE_DIR, SITE_DIR, TARGET_DIR, SITE, useragent
# Side effect: changes the current directory to the download directory.
# Exits through Err_Fatal when the download directory cannot be entered or
# when wget reports a failure. Both cases used to end the script through
# `set -e` without any message, leaving the progress line unfinished.
Fetch_Images() {
	if [ "$(grep -c . "${TEMP_PREFIX}-newlist")" -eq 0 ]; then
		echo "No new file"
	else
		printf "Downloading files... "
		cd "${BASE_DIR}/${SITE_DIR}/${TARGET_DIR}" || Err_Fatal "Unable to enter ${BASE_DIR}/${SITE_DIR}/${TARGET_DIR}"
		wget --no-check-certificate -e continue=on -i "${TEMP_PREFIX}-newlist" -o "${TEMP_PREFIX}.log" --referer="${SITE}/post" --user-agent="${useragent}" || Err_Fatal "Download failed or incomplete. See ${TEMP_PREFIX}.log for details"
		echo "done"
	fi
}

# Parse the command line and derive the working variables.
#
# Arguments: COMMAND [-n] [-p PASSWORD] [-s SITE_URL] [-u USERNAME] TAGS...
# Reads:  ADDITIONAL_PATH, DEFAULT_SITE, BASE_DIR
# Writes: PATH (when ADDITIONAL_PATH is set), ISQUICK, ISNEW, NOCLEAN, JOB,
#         SITE, TAGS, BASE_DIR, TARGET_DIR, SITE_DIR, TEMP_PREFIX,
#         _use_login (1 when both -u and -p were given), LOGIN_USER,
#         LOGIN_PASS (SHA-1 hex digest of the password as given)
# Exits through Err_Help on a wrong command, too few arguments or an unknown
# option, and through Err_Fatal when no tag is left after the options.
#
# Fixes compared with the previous version:
#  - "$1" was tested after the options had been shifted away; with no tag
#    left this was a fatal "unset variable" error under `set -u` instead of
#    the "No tag specified" message;
#  - NOCLEAN was only assigned by -n, leaving it unset for later readers; a
#    value inherited from the environment is kept;
#  - TAGS is built with "$*" (one string) rather than "$@";
#  - unknown options now show the help text instead of being ignored.
init()
{
	# path initialization
	if [ -n "${ADDITIONAL_PATH}" ]; then
		PATH="${ADDITIONAL_PATH}:${PATH}"
		export PATH
	fi

	# misc variables
	ISQUICK=
	ISNEW=
	NOCLEAN=${NOCLEAN:-}

	# minimum number of arguments: 2 (command and tag)
	if [ "$#" -lt 2 ]; then
		Err_Help
	fi
	case "$1" in
		check|fetch|quickfetch)
			echo "Starting..."
			JOB="$1"
		;;
		*)
			Err_Help
		;;
	esac
	shift
	SITE=
	TAGS=
	has_pass=0
	has_user=0
	OPTIND=1
	while getopts "s:nu:p:" opt
	do
		case "$opt" in
			s) SITE="$OPTARG";;
			n) NOCLEAN=1;;
			p)
				LOGIN_PASS=$(printf "%s" "$OPTARG" | openssl dgst -sha1 | sed -e 's/.*\([[:xdigit:]]\{40\}\).*/\1/')
				has_pass=1
			;;
			u)
				LOGIN_USER="$OPTARG"
				has_user=1
			;;
			*) Err_Help;;
		esac
	done
	# getopts leaves OPTIND behind the last option and behind a "--"
	# separator, so everything that remains is a tag
	shift $((OPTIND - 1))
	TAGS="$*"
	[ -n "${SITE}" ] || SITE=${DEFAULT_SITE}
	[ -n "${TAGS}" ] || Err_Fatal "No tag specified"
	# Get base folder - default, current folder or fallback to ${HOME}
	[ -n "${BASE_DIR}" ] || BASE_DIR=${PWD}
	[ -n "${BASE_DIR}" ] || BASE_DIR=${HOME}
	# force an absolute path
	case "${BASE_DIR}" in
		/*) ;;
		*) BASE_DIR="/${BASE_DIR}";;
	esac
	# a login is only used when both user name and password were given
	if [ "${has_pass}" -eq 1 ] && [ "${has_user}" -eq 1 ]; then
		_use_login=1
	fi

	echo "Tags: ${TAGS}"
	# slash is not wanted for folder name
	TARGET_DIR=$(printf "%s\n" "${TAGS}" | sed -e 's/\//_/g')
	SITE_DIR=$(printf "%s\n" "${SITE}" | sed -e 's/\/$//g;s/\//_/g')
	TEMP_PREFIX="${BASE_DIR}/temp/${SITE_DIR}-${TARGET_DIR}"
}

# Set the script-wide defaults that must exist before anything else runs.
init_globals()
{
	# 0 = no login is sent; init changes it to 1 when both -u and -p are given
	_use_login=0
	# version string printed by msg_welcome
	_version="1.0-rc3"
}

# Program flow: set up, then run the job selected on the command line.
#   check      - build the remote list and compare it with the local files
#   fetch      - as check, then download what is missing
#   quickfetch - as fetch, but ISNEW/ISQUICK are forced to 1 first
# Arguments: the script's command-line arguments, passed on to init.
main()
{
	# removing GNU-ism as much as possible
	# NOTE(review): the variable is assigned but not exported here
	POSIXLY_CORRECT=1

	init_globals    # defaults (_version, _use_login)
	msg_welcome     # banner
	init "$@"       # command line -> JOB, SITE, TAGS, directories
	Check_Tools
	Check_Folders

	# quick mode flags have to be in place before the list is processed
	case "${JOB}" in
		quickfetch)
			ISNEW=1
			ISQUICK=1
		;;
	esac

	# let's do the job!
	case "${JOB}" in
		check)
			Generate_Link
			Check_Files
		;;
		fetch|quickfetch)
			Generate_Link
			Check_Files
			Fetch_Images
		;;
	esac
}

# Script entry point: run main with the command-line arguments exactly as
# they were given ("$@" keeps each argument as a separate word).
main "$@"