#!/bin/sh

# Copyright (c) 2009-2012, edogawaconan <edho@myconan.net>
# 
# Permission to use, copy, modify, and/or distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
# 
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
#
# Lots of bugs here. Use with care
# USE WITH CARE
#
# what it does: fetch every picture that has the specified TAGS.
# requirement: wget, libxslt, openssl
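#
# Example invocations (hypothetical tag names; assumes the script is on PATH as "moefetch"):
#   moefetch fetch "some_tag"                 # full update for one tag
#   moefetch check -s yande.re "tag_a tag_b"  # list/clean only, no download
#   moefetch quickfetch -n "some_tag"         # skip file checking and repository cleanup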

# program additional paths for: cut, sed, wc, openssl, wget, xsltproc, grep
ADDITIONAL_PATH=

# Default server address. Danbooru-powered sites only! I take no responsibility for stupidity.
DEFAULT_SITE="yande.re"

# Base directory. Make sure it's writable. No ownership check is done for this one, so I take no responsibility if you don't own the folder and its files.
# Structure is ${BASE_DIR}/<SITE>/<TAGS>
# Absolute path only.
# Leave empty to use whatever folder you're running this from.
BASE_DIR=

# not user-modifiable from here on

# stop on any error
set -e
# ensure all variables are initialized
set -u
useragent="Mozilla/5.0 (Windows NT 6.1; WOW64; rv:10.0) Gecko/20100101 Firefox/10.0"

# useless welcome message. Also version
msg_welcome() {
  echo "moefetch ${_version}
Copyright (c) 2009-2012 edogawaconan <edho@myconan.net>
"
}

# Sanitize path. Totally safe. Usage: cmd "$(safe_path "${filename}")"
safe_path()
{
  # It all depends on the first character.
  start=$(printf "%s" "$*" | cut -c 1)
  path=
  case "${start}" in
    .|/) path="$*";; # . and / are safe. No change.
    *) path="./$*";; # Anything else must be prefixed with ./
  esac
  printf "%s" "${path}" # Return.
}
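
# Example (illustrative): safe_path "-rf foo" returns "./-rf foo", while "/tmp/a"
# and "./b" pass through unchanged, so a leading dash can't be read as an option.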

# Checks md5. OpenSSL should be available on anything usable.
get_md5() { cat "$(safe_path "${1}")" | openssl dgst -md5 | tail -n 1 | sed -e 's/.*\([[:xdigit:]]\{32\}\).*/\1/'; }
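
# Example (illustrative): get_md5 "foo.jpg" prints the 32-character md5 of the
# file's contents, e.g. d41d8cd98f00b204e9800998ecf8427e for an empty file.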

# Safely get basename.
get_basename() { basename "$(safe_path "${1}")"; }

# Safely get filename (basename without the extension).
get_filename() { get_basename "${1%.*}"; }

# Transformation for tag url.
get_cleantags() { printf "%s " "$*" | sed -e 's/\&/%26/g;s/=/%3D/g'; }
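
# Example (illustrative): get_cleantags "a&b c=d" prints "a%26b c%3Dd " (note
# the trailing space from the printf format).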

# Returns something if not an md5 value.
is_not_md5() { get_filename "$1" | sed -e 's/\([0-9a-f]\{32\}\)//g'; }
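
# Example (illustrative): is_not_md5 "d41d8cd98f00b204e9800998ecf8427e.jpg"
# prints nothing (bare md5 name), while is_not_md5 "cover.jpg" prints "cover",
# marking it as not Danbooru-style.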


# fatal error handler
Err_Fatal() {
  echo "
Fatal error: ${1}"
  exit 1
}

Err_Impossible() {
  echo "
Impossible error. Or you modified the contents of the working directories while the script was running.
Please report to moefetch.googlecode.com if you see this message (complete with the entire run log)"
  exit 1
}

# help message
Err_Help() {
  echo "moefetch.sh COMMAND [-n] [-p PASSWORD] [-s SITE_URL] [-u USERNAME] TAGS

COMMAND:
  (quick)fetch:
    Do a complete update. Use the quick prefix to skip file checking.
  check:
    Get the list of new files, clean up the local folder, and print the number of new files.

OPTIONS:
  -n:
    Skip cleaning up the repository directory.
  -p PASSWORD:
    Specify the password for login.
  -s SITE_URL:
    Specify the URL of the Danbooru-powered site you want to leech from. Default is ${DEFAULT_SITE}.
  -u USERNAME:
    Specify the username for login.
  TAGS:
    Tags you want to download, separated by spaces. Tag names follow the standard Danbooru tagging scheme."
  exit 2
}

# generate link by transforming xml
Generate_Link() {
  echo "
Fetching XML file"
  pagelimit=100
  tempnum="${pagelimit}"
  iternum=1
  > "${TEMP_PREFIX}-list"
  while [ "${tempnum}" -ge "${pagelimit}" ]; do
    url="${SITE}/post/index.xml?tags=$(get_cleantags "${TAGS}")&offset=0&limit=${pagelimit}&page=${iternum}"
    [ ${_use_login} -eq 1 ] && url="${url}&login=${LOGIN_USER}&password_hash=${LOGIN_PASS}"
    wget --no-check-certificate --quiet "${url}" -O "${TEMP_PREFIX}-xml" --referer="${SITE}/post" --user-agent="${useragent}" -e continue=off || Err_Fatal "Failed downloading the catalog file"
    printf "Processing XML file... "
    # xslt evilry
    xsltproc - "${TEMP_PREFIX}-xml" <<EOF | sed 's/.*\(https*.*\)\(\/[a-f0-9]\{32\}\).*\.\([^\.]*\)/\1\2.\3/g' | grep ^http > "${TEMP_PREFIX}-templist" || true # no match is reported below, don't let set -e abort
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
<xsl:output method="xml" indent="yes"/>
<xsl:template match="post">
<xsl:value-of select="@file_url" />
</xsl:template>
</xsl:stylesheet>
EOF
    tempnum=$(grep -c . "${TEMP_PREFIX}-templist") || true # grep -c exits non-zero on a zero count
    iternum=$((iternum + 1))
    cat "${TEMP_PREFIX}-templist" >> "${TEMP_PREFIX}-list"
    echo "${tempnum} file(s) available"
  done
  numfiles=$(grep -c . "${TEMP_PREFIX}-list") || true # zero files is handled by the check below
  echo "${numfiles} file(s) available on server"
  [ "${numfiles}" -gt 0 ] || Err_Fatal "Error in processing list or no files can be found with specified tag(s) or site."
}
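
# The resulting ${TEMP_PREFIX}-list holds one file URL per line, shaped like
# http(s)://<host>/<path>/<32-hex-md5>.<ext>; the checks below match local
# filenames against these lines.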


progress_init() {
  _last="-"
  printf "${_last}"
}

progress_anim() {
  case "${_last}" in
    /) _last="-";;
    -) _last=\\;;
    \\) _last=\|;;
    \|) _last="/";;
  esac
  printf "\b${_last}"
}

progress_done() { printf "\bdone\n"; }

# getting rid of ls (as per suggestion)
Count_Files() {
  numfiles=0
  for dircontent in "${*}/"* "${*}/".*; do 
    if [ -e "${dircontent}" ] && [ x"${dircontent}" != x"${*}/." ] && [ x"${dircontent}" != x"${*}/.." ]; then 
      numfiles=$((numfiles + 1))
    fi
  done
  echo $((numfiles))
}

# check tools availability
Check_Tools() {
  # verify all programs required do indeed exist
  commands="cut sed wc wget xsltproc xargs rm mkdir chown comm grep date openssl"
  for cmd in ${commands}
  do
     [ "$(command -v "${cmd}")" ] || Err_Fatal "${cmd} doesn't exist in ${PATH}"
  done
}

# verify required folders exist and are writable
Check_Folders(){
  [ -O "${BASE_DIR}" ] || Err_Fatal "You don't own ${BASE_DIR}. Please fix ${BASE_DIR} or run this script in your own directory."
  for directory in temp trash deleted "${SITE_DIR}/${TARGET_DIR}"; do
    if [ ! -d "${BASE_DIR}/${directory}" ]; then
      mkdir -p "${BASE_DIR}/${directory}" || Err_Impossible
    fi
    if [ ! -O "${BASE_DIR}/${directory}" ]; then
      echo "You don't own the ${BASE_DIR}/${directory}, applying globally writeable permission on it"
      chmod -R u=rwX,g=rwX,o=rwX "${BASE_DIR}/${directory}" || Err_Impossible
    fi
  done
  [ "$(Count_Files "${BASE_DIR}/${SITE_DIR}/${TARGET_DIR}")" -eq 0 ] && ISNEW=1
  for i in error ok list newlist templist; do
    touch "${TEMP_PREFIX}-${i}" || Fatal_Err "Error creating ${TEMP_PREFIX}-${i}. This shouldn't happen"
  done
}

# Do some cleanup
Cleanup_Repository() {
  # current dir: ${BASE_DIR}/${SITE_DIR}/${TARGET_DIR}
  printf "Cleaning up repository folder... "
  progress_init
  trash_dir="${BASE_DIR}/trash/${trash_dir}/$(date -u "+${SITE_DIR}-${TARGET_DIR}-%Y%m%d-%H.%M")"
  trashes="These files have been moved to ${trash_dir}:"
  has_trash=
  if [ ! -d "${trash_dir}" ]; then
    mkdir -p "${trash_dir}" || Err_Impossible
  else
    if [ ! -O "${trash_dir}" ]; then
      chmod -R u=rwX,g=rwX,o=rwX "${trash_dir}" || Err_Impossible
    fi
  fi
  for trash in "${BASE_DIR}/${SITE_DIR}/${TARGET_DIR}/"*
  do
    if [ -e "${trash}" ]; then
      is_trash=
      if [ -d "${trash}" ] || [ -n "$(is_not_md5 "${trash}")" ] || [ -z "$(grep "$(get_basename "${trash}")" "${TEMP_PREFIX}-list")" ]; then
        is_trash=1
        has_trash=1
        mv -f -- "${trash}" "${trash_dir}" || Err_Impossible
        trashes="${trashes}
  $(get_basename "${trash}")"
      fi
    fi
    progress_anim
  done
  rmdir "${trash_dir}" 2>/dev/null
  progress_done
  [ -n "${has_trash}" ] && echo "${trashes}"
}

# check file correctness
Check_Files() {
  if [ ! -n "${ISNEW}" ]; then
    [ -z "${NOCLEAN}" ] && Cleanup_Repository
    printf "Checking for errors... "
    progress_init
    files_error="These files do not match their md5:"
    files_notdanbooru="These files are not checked:"
    has_err_filename=
    has_err_md5=
    > "${TEMP_PREFIX}-error"
    > "${TEMP_PREFIX}-ok"
    for file in "${BASE_DIR}/${SITE_DIR}/${TARGET_DIR}/"*
    do
      if [ -e "${file}" ]; then
        if [ -n "$(is_not_md5 "${file}")" ] || [ -d "${file}" ]; then
          files_notdanbooru="${files_notdanbooru}
  $(get_basename "${file}")"
          has_err_filename=1
        else
          if [ "$(get_md5 "${file}")" = "$(get_filename "${file}")" ]; then
            echo "$(get_basename "${file}")" >> "${TEMP_PREFIX}-ok"
          else
            rm "${file}" || Err_Fatal "Error removing ${file}"
            echo "$(get_basename "${file}")" >> "${TEMP_PREFIX}-error"
            files_error="${files_error}
  $(get_basename "${file}")"
            has_err_md5=1
          fi
        fi
      fi
      progress_anim
    done
    progress_done
    if [ ! -n "${has_err_md5}" ] && [ ! -n "${has_err_filename}" ]; then
      echo "All files OK"
    else
      if [ -n "${has_err_md5}" ]; then
        echo "${files_error}"
        echo "$(grep -c . "${TEMP_PREFIX}-error") file(s) removed"
      fi
      [ -n "${has_err_filename}" ] && echo "${files_notdanbooru}" 
    fi
    echo "$(grep -c . "${TEMP_PREFIX}-ok") file(s) available locally"

    printf "Generating list of new files... "
    progress_init
    cp -f "${TEMP_PREFIX}-list" "${TEMP_PREFIX}-templist"
    while read -r is_ok; do
      grep -v "${is_ok}" "${TEMP_PREFIX}-templist" > "${TEMP_PREFIX}-newlist" || true # non-zero when nothing is left; don't let set -e abort
      cp -f "${TEMP_PREFIX}-newlist" "${TEMP_PREFIX}-templist" || Err_Impossible
      progress_anim
    done < "${TEMP_PREFIX}-ok"
    progress_done
    echo "$(grep -c . "${TEMP_PREFIX}-newlist") file(s) to be downloaded"    
  else
    if [ -n "${ISQUICK}" ]; then
      echo "Quick mode selected. Skipping check"
    else
      echo "Empty local repository"
    fi
    cat "${TEMP_PREFIX}-list" > "${TEMP_PREFIX}-newlist"
  fi
}

# start downloading the images
Fetch_Images() {
  if [ "$(grep -c . "${TEMP_PREFIX}-newlist")" -eq 0 ]; then
    echo "No new file"
  else
    printf "Downloading files... "
    cd "${BASE_DIR}/${SITE_DIR}/${TARGET_DIR}"
    wget --no-check-certificate -e continue=on -i "${TEMP_PREFIX}-newlist" -o "${TEMP_PREFIX}.log" --referer="${SITE}/post" --user-agent="${useragent}"
  fi
}

# initialize base variables and initial command check
init()
{
  # path initialization
  # check if additional path is specified
  if [ -n "${ADDITIONAL_PATH}" ]
  then
    # insert the additional path
    PATH="${ADDITIONAL_PATH}:${PATH}"
    export PATH
  fi
  
  # misc variables
  ISQUICK=
  ISNEW=
  NOCLEAN=
  
  # minimum number of arguments: 2 (command and tag). If less than two, exit and print help message
  [ $# -lt 2 ] && Err_Help
  case "$1" in
    check|fetch|quickfetch)
      echo "Starting..."
      JOB="$1"
    ;;
    *)
      Err_Help
    ;;
  esac
  shift
  SITE=
  TAGS=
  has_pass=0
  has_user=0
  x=1
  while getopts "s:nu:p:" opt
  do
    case "$opt" in
      s) SITE="$OPTARG";;
      n) NOCLEAN=1;;
      p)
        LOGIN_PASS=$(printf "%s" "$OPTARG" | openssl dgst -sha1 | sed -e 's/.*\([[:xdigit:]]\{40\}\).*/\1/')
        has_pass=1
      ;;
      u)
        LOGIN_USER="$OPTARG"
        has_user=1
      ;;
    esac
    x=$OPTIND
  done
  shift $((x - 1))
  # ${1:-} avoids an unbound-variable error under set -u when no tag is given
  if [ "${1:-}" = "--" ]; then shift; fi
  TAGS="$*"
  [ -n "${SITE}" ] || SITE=${DEFAULT_SITE}
  [ -n "${TAGS}" ] || Err_Fatal "No tag specified"
  # Get base folder - default, current folder or fallback to ${HOME}
  [ -n "${BASE_DIR}" ] || BASE_DIR=${PWD}
  [ -n "${BASE_DIR}" ] || BASE_DIR=${HOME}
  [ -n "$(echo "${BASE_DIR}" | cut -c1 | grep \/)" ] || BASE_DIR="/${BASE_DIR}"
  # see if both pass and user are set. If they are, set _use_login to 1.
  [ ${has_pass} -eq 1 ] && [ ${has_user} -eq 1 ] && _use_login=1

  echo "Tags: ${TAGS}"
  # slash is not wanted for folder name
  TARGET_DIR=$(echo "${TAGS}" | sed -e 's/\//_/g')
  SITE_DIR=$(echo "${SITE}" | sed -e 's/\/$//g;s/\//_/g')
  TEMP_PREFIX="${BASE_DIR}/temp/${SITE_DIR}-${TARGET_DIR}"
}
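
# Example layout (hypothetical values): with SITE=yande.re and TAGS="tag_a tag_b",
# downloads go to ${BASE_DIR}/yande.re/tag_a tag_b and the temporary lists are
# ${BASE_DIR}/temp/yande.re-tag_a tag_b-*.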

# global variables go here
init_globals()
{
  _version="1.0-rc3"  # version of this script
  _use_login=0  # variable to check whether a login is used or not
}

main()
{
  # removing GNU-ism as much as possible
  POSIXLY_CORRECT=1
  export POSIXLY_CORRECT # export so external GNU tools also honor it
  #initialize global variables
  init_globals
  #print welcome message
  msg_welcome
  # initialization
  init "$@"
  Check_Tools
  Check_Folders


  # let's do the job!
  case "${JOB}" in
    check)
      Generate_Link
      Check_Files
    ;;
    fetch)
      Generate_Link
      Check_Files
      Fetch_Images
    ;;
    quickfetch)
      ISNEW=1
      ISQUICK=1
      Generate_Link
      Check_Files
      Fetch_Images
    ;;
  esac
}

# call the main routine!
main "$@"