Mercurial > ec-dotfiles
view bin/moefetch @ 639:c107e18388a2
Probably better diff algorithm
| author | nanaya <me@nanaya.pro> | 
|---|---|
| date | Sat, 29 Feb 2020 23:06:14 +0000 | 
| parents | 38c7615caf9e | 
| children | 
line wrap: on
 line source
#!/bin/sh

# Copyright (c) 2009-2012, edogawaconan <edho@myconan.net>
#
# Permission to use, copy, modify, and/or distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
#
# Lots of bugs here. Use with care
# USE WITH CARE
#
# what it does: fetch every picture that has the specified TAGS.
# requirement: wget, libxslt, openssl

# program additional paths for: sed, openssl, wget, xsltproc, grep
ADDITIONAL_PATH=

# default server address. Danbooru only! I do not take responsibility of stupidity.
DEFAULT_SITE="yande.re"

# base directory. make sure it's writeable. I do not take responsibility if
# you don't own the folder and files as no check is done for this one.
# Structure is ${BASE_DIR}/<TAGS>
# Absolute path only.
# Leave empty to use whatever folder you're running this at
BASE_DIR=

# not user modifiable from here

# stop on any error
set -e
# ensures all variables initialized
set -u

useragent="Mozilla/5.0 (Windows NT 6.1; WOW64; rv:10.0) Gecko/20100101 Firefox/10.0"

# useless welcome message. Also version
msg_welcome() {
  echo "moefetch ${_version}
Copyright (c) 2009-2012 edogawaconan <edho@myconan.net>
"
}

# Sanitize path so it can never be mistaken for an option.
# Usage: cmd "$(safe_path "${filename}")"
# Paths starting with . or / are returned unchanged; anything else
# (including the empty string) is prefixed with ./
safe_path() {
  case "$*" in
    .*|/*) printf "%s" "$*" ;;
    *) printf "%s" "./$*" ;;
  esac
}

# Prints the md5 (32 lowercase/uppercase hex digits as emitted by openssl)
# of file $1. OpenSSL should be available on anything usable.
get_md5() {
  openssl dgst -md5 < "$(safe_path "${1}")" \
    | tail -n 1 \
    | sed -e 's/.*\([[:xdigit:]]\{32\}\).*/\1/'
}

# Safely get basename.
get_basename() {
  basename "$(safe_path "${1}")"
}

# Safely get filename (basename without the last extension).
# The basename is taken FIRST: stripping "${1%.*}" from the full path would
# cut at a dot inside a directory name (e.g. ".../yande.re/tag/file").
get_filename() {
  _gf_base=$(get_basename "${1}")
  printf '%s\n' "${_gf_base%.*}"
}

# Transformation for tag url: escapes & and = so they survive in the query.
get_cleantags() {
  printf "%s " "$*" | sed -e 's/\&/%26/g;s/=/%3D/g'
}

# Prints something (non-empty) if the filename of $1 is NOT exactly 32
# lowercase hex digits; prints nothing when it is an md5-style name.
is_not_md5() {
  _nm_name=$(get_filename "$1")
  if printf '%s\n' "${_nm_name}" | grep -q '^[0-9a-f]\{32\}$'; then
    return 0
  fi
  # Never print an empty string for a non-md5 name: callers test with -n.
  printf '%s\n' "${_nm_name:-<empty>}"
}

# Prints the number of non-empty lines in file $1 (0 if it does not exist).
# grep -c exits with status 1 when the count is 0, which would abort the
# whole script under "set -e"; that status is deliberately ignored here.
count_lines() {
  if [ ! -f "$1" ]; then
    echo 0
    return 0
  fi
  grep -c . "$1" || true
}

# fatal error handler. $1: message. Exits with status 1.
Err_Fatal() {
  echo "
Fatal error: ${1}"
  exit 1
}

# handler for "can't happen" failures. Exits with status 1.
Err_Impossible() {
  echo "
Impossible error. Or you modified content of the working directories when the script is running.
Please report to moefetch.googlecode.com if you see this message (complete with entire run log)"
  exit 1
}

# help message. Exits with status 2.
Err_Help() {
  echo "moefetch.sh COMMAND [-n] [-p PASSWORD] [-s SITE_URL] [-u USERNAME] TAGS

COMMAND:
  (quick)fetch:
    Do a complete update. Add prefix quick to skip file checking
  check:
    Get list of new files, clean up local folder and print total new files

OPTIONS:
  -n:
    Skip checking repository directory.
  -p PASSWORD:
    Specifies password for login.
  -s SITE_URL:
    Specify URL of the Danbooru powered site you want to leech from. Default is ${DEFAULT_SITE}.
  -u USERNAME:
    Specifies username for login.
  TAGS:
    Tags you want to download. Separated by spaces. Tag name follows standard Danbooru tagging scheme."
  exit 2
}

# generate link by transforming xml.
# Downloads the post index page by page (100 posts per page) until a page
# yields fewer than 100 urls, accumulating file urls in ${TEMP_PREFIX}-list.
Generate_Link() {
  echo "
Fetching XML file"
  pagelimit=100
  tempnum="${pagelimit}"
  iternum=1
  : > "${TEMP_PREFIX}-list"
  while [ "${tempnum}" -ge "${pagelimit}" ]; do
    url="${SITE}/post/index.xml?tags=$(get_cleantags "${TAGS}")&offset=0&limit=${pagelimit}&page=${iternum}"
    if [ "${_use_login}" -eq 1 ]; then
      url="${url}&login=${LOGIN_USER}&password_hash=${LOGIN_PASS}"
    fi
    wget --no-check-certificate --quiet "${url}" -O "${TEMP_PREFIX}-xml" --referer="${SITE}/post" --user-agent="${useragent}" -e continue=off || Err_Fatal "Failed download catalog file"
    printf "Processing XML file... "
    # xslt evilry.
    # "|| true": grep exits 1 when a page has no url at all; that must reach
    # the friendly "no files" error below instead of killing us via set -e.
    xsltproc - "${TEMP_PREFIX}-xml" <<'EOF' | sed 's/.*\(https*.*\)\(\/[a-f0-9]\{32\}\).*\.\([^\.]*\)/\1\2.\3/g' | grep '^http' > "${TEMP_PREFIX}-templist" || true
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
<xsl:output method="xml" indent="yes"/>
<xsl:template match="post">
<xsl:value-of select="@file_url" />
</xsl:template>
</xsl:stylesheet>
EOF
    tempnum=$(count_lines "${TEMP_PREFIX}-templist")
    iternum=$((iternum + 1))
    cat "${TEMP_PREFIX}-templist" >> "${TEMP_PREFIX}-list"
    echo "${tempnum} file(s) available"
  done
  numfiles=$(count_lines "${TEMP_PREFIX}-list")
  echo "${numfiles} file(s) available on server"
  [ "${numfiles}" -gt 0 ] || Err_Fatal "Error in processing list or no files can be found with specified tag(s) or site."
}

# Spinner helpers. The current glyph is kept in ${_last}; it is always
# passed to printf as data, never as the format string.
progress_init() {
  _last="-"
  printf '%s' "${_last}"
}

progress_anim() {
  case "${_last}" in
    /) _last="-" ;;
    -) _last='\' ;;
    '\') _last='|' ;;
    '|') _last="/" ;;
  esac
  printf '\b%s' "${_last}"
}

progress_done() {
  printf '\bdone\n'
}

# getting rid of ls (as per suggestion).
# Prints the number of entries (hidden ones included, . and .. excluded)
# in directory $*.
Count_Files() {
  numfiles=0
  for dircontent in "${*}/"* "${*}/".*; do
    if [ -e "${dircontent}" ] \
      && [ x"${dircontent}" != x"${*}/." ] \
      && [ x"${dircontent}" != x"${*}/.." ]; then
      numfiles=$((numfiles + 1))
    fi
  done
  echo $((numfiles))
}

# check tools availability: every external program this script runs.
Check_Tools() {
  commands="basename chmod cp date grep mkdir mv openssl rm rmdir sed tail touch wget xsltproc"
  for cmd in ${commands}; do
    command -v "${cmd}" > /dev/null 2>&1 || Err_Fatal "${cmd} doesn't exist in ${PATH}"
  done
}

# verify required folders exist and writeable, create the temp files and
# set ISNEW=1 when the target directory is empty.
Check_Folders() {
  [ -O "${BASE_DIR}" ] || Err_Fatal "You don't own ${BASE_DIR}. Please fix ${BASE_DIR} or run this script in your own directory."
  for directory in temp trash deleted "${SITE_DIR}/${TARGET_DIR}"; do
    if [ ! -d "${BASE_DIR}/${directory}" ]; then
      mkdir -p "${BASE_DIR}/${directory}" || Err_Impossible
    fi
    if [ ! -O "${BASE_DIR}/${directory}" ]; then
      echo "You don't own the ${BASE_DIR}/${directory}, applying globally writeable permission on it"
      chmod -R u=rwX,g=rwX,o=rwX "${BASE_DIR}/${directory}" || Err_Impossible
    fi
  done
  if [ "$(Count_Files "${BASE_DIR}/${SITE_DIR}/${TARGET_DIR}")" -eq 0 ]; then
    ISNEW=1
  fi
  for i in error ok list newlist templist; do
    touch "${TEMP_PREFIX}-${i}" || Err_Fatal "Error creating ${TEMP_PREFIX}-${i}. This shouldn't happen"
  done
}

# Do some cleanup: move everything in the target directory that is a
# directory, is not named like an md5, or is not in the server list into a
# timestamped folder below ${BASE_DIR}/trash.
Cleanup_Repository() {
  printf "Cleaning up repository folder... "
  progress_init
  # The timestamp is appended outside the date format so a % in a tag can
  # not be misread as a conversion specifier.
  trash_dir="${BASE_DIR}/trash/${SITE_DIR}-${TARGET_DIR}-$(date -u '+%Y%m%d-%H.%M')"
  trashes="These files have been moved to ${trash_dir}:"
  has_trash=
  if [ ! -d "${trash_dir}" ]; then
    mkdir -p "${trash_dir}" || Err_Impossible
  elif [ ! -O "${trash_dir}" ]; then
    chmod -R u=rwX,g=rwX,o=rwX "${trash_dir}" || Err_Impossible
  fi
  for trash in "${BASE_DIR}/${SITE_DIR}/${TARGET_DIR}/"*; do
    if [ -e "${trash}" ]; then
      # -F: the basename is a literal string, not a regular expression.
      if [ -d "${trash}" ] \
        || [ -n "$(is_not_md5 "${trash}")" ] \
        || ! grep -qF -- "$(get_basename "${trash}")" "${TEMP_PREFIX}-list"; then
        has_trash=1
        mv -f -- "${trash}" "${trash_dir}" || Err_Impossible
        trashes="${trashes}
  $(get_basename "${trash}")"
      fi
    fi
    progress_anim
  done
  # Remove the trash folder only if nothing was moved into it; failure
  # (folder not empty) is the normal case when there was trash.
  rmdir "${trash_dir}" 2> /dev/null || true
  progress_done
  if [ -n "${has_trash}" ]; then
    echo "${trashes}"
  fi
}

# check files correctness: verify local files against their md5 name,
# remove the broken ones and write the list of urls still to be downloaded
# to ${TEMP_PREFIX}-newlist.
Check_Files() {
  if [ -z "${ISNEW}" ]; then
    if [ -z "${NOCLEAN}" ]; then
      Cleanup_Repository
    fi
    printf "Checking for errors... "
    progress_init
    files_error="These files do not match its md5:"
    files_notdanbooru="These files are not checked:"
    has_err_filename=
    has_err_md5=
    : > "${TEMP_PREFIX}-error"
    : > "${TEMP_PREFIX}-ok"
    for file in "${BASE_DIR}/${SITE_DIR}/${TARGET_DIR}/"*; do
      if [ -e "${file}" ]; then
        if [ -n "$(is_not_md5 "${file}")" ] || [ -d "${file}" ]; then
          files_notdanbooru="${files_notdanbooru}
  $(get_basename "${file}")"
          has_err_filename=1
        elif [ "$(get_md5 "${file}")" = "$(get_filename "${file}")" ]; then
          get_basename "${file}" >> "${TEMP_PREFIX}-ok"
        else
          rm "${file}" || Err_Fatal "Error removing ${file}"
          get_basename "${file}" >> "${TEMP_PREFIX}-error"
          files_error="${files_error}
  $(get_basename "${file}")"
          has_err_md5=1
        fi
      fi
      progress_anim
    done
    progress_done
    if [ -z "${has_err_md5}" ] && [ -z "${has_err_filename}" ]; then
      echo "All files OK"
    else
      if [ -n "${has_err_md5}" ]; then
        echo "${files_error}"
        echo "$(count_lines "${TEMP_PREFIX}-error") file(s) removed"
      fi
      if [ -n "${has_err_filename}" ]; then
        echo "${files_notdanbooru}"
      fi
    fi
    echo "$(count_lines "${TEMP_PREFIX}-ok") file(s) available locally"
    printf "Generating list of new files... "
    progress_init
    if [ -s "${TEMP_PREFIX}-ok" ]; then
      # Drop every url that contains the name of a verified local file.
      # "|| true": grep exits 1 when nothing is left to download.
      grep -vF -f "${TEMP_PREFIX}-ok" -- "${TEMP_PREFIX}-list" > "${TEMP_PREFIX}-newlist" || true
    else
      # No valid local file: everything on the server is new.
      cp -f "${TEMP_PREFIX}-list" "${TEMP_PREFIX}-newlist" || Err_Impossible
    fi
    progress_done
    echo "$(count_lines "${TEMP_PREFIX}-newlist") file(s) to be downloaded"
  else
    if [ -n "${ISQUICK}" ]; then
      echo "Quick mode selected. Skipping check"
    else
      echo "Empty local repository"
    fi
    cp -f "${TEMP_PREFIX}-list" "${TEMP_PREFIX}-newlist" || Err_Impossible
  fi
}

# start downloading the images listed in ${TEMP_PREFIX}-newlist into the
# target directory; wget's log goes to ${TEMP_PREFIX}.log
Fetch_Images() {
  if [ "$(count_lines "${TEMP_PREFIX}-newlist")" -eq 0 ]; then
    echo "No new file"
  else
    printf "Downloading files... "
    cd "${BASE_DIR}/${SITE_DIR}/${TARGET_DIR}" || Err_Fatal "Cannot enter ${BASE_DIR}/${SITE_DIR}/${TARGET_DIR}"
    wget --no-check-certificate -e continue=on -i "${TEMP_PREFIX}-newlist" -o "${TEMP_PREFIX}.log" --referer="${SITE}/post" --user-agent="${useragent}"
  fi
}

# initialize base variables and initial command check.
# Arguments: COMMAND [-n] [-p PASSWORD] [-s SITE_URL] [-u USERNAME] TAGS...
init() {
  # path initialization: insert the additional path if one is specified
  if [ -n "${ADDITIONAL_PATH}" ]; then
    PATH="${ADDITIONAL_PATH}:${PATH}"
    export PATH
  fi

  # misc variables. All of them must exist because of "set -u".
  ISQUICK=
  ISNEW=
  NOCLEAN=
  LOGIN_USER=
  LOGIN_PASS=

  # minimum number of arguments: 2 (command and tag).
  # If less than two, exit and print help message
  if [ $# -lt 2 ]; then
    Err_Help
  fi
  case "$1" in
    check|fetch|quickfetch)
      echo "Starting..."
      JOB="$1"
      ;;
    *)
      Err_Help
      ;;
  esac
  shift

  SITE=
  TAGS=
  has_pass=0
  has_user=0
  while getopts "s:nu:p:" opt; do
    case "${opt}" in
      s) SITE="${OPTARG}" ;;
      n) NOCLEAN=1 ;;
      p)
        LOGIN_PASS=$(printf "%s" "${OPTARG}" | openssl dgst -sha1 | sed -e 's/.*\([[:xdigit:]]\{40\}\).*/\1/')
        has_pass=1
        ;;
      u)
        LOGIN_USER="${OPTARG}"
        has_user=1
        ;;
      # getopts has already printed its own diagnostic at this point
      \?) Err_Help ;;
    esac
  done
  shift $((OPTIND - 1))
  # ${1:-}: there may be no argument left at all, which "set -u" would
  # otherwise turn into an abort before the "No tag specified" message.
  if [ "${1:-}" = -- ]; then
    shift
  fi
  TAGS="$*"

  [ -n "${SITE}" ] || SITE=${DEFAULT_SITE}
  [ -n "${TAGS}" ] || Err_Fatal "No tag specified"

  # Get base folder - default, current folder or fallback to ${HOME}
  [ -n "${BASE_DIR}" ] || BASE_DIR=${PWD}
  [ -n "${BASE_DIR}" ] || BASE_DIR=${HOME}
  # force an absolute path
  case "${BASE_DIR}" in
    /*) ;;
    *) BASE_DIR="/${BASE_DIR}" ;;
  esac

  # see if both pass and user are set.
  # If they're set, switch _use_login variable content to 1.
  if [ "${has_pass}" -eq 1 ] && [ "${has_user}" -eq 1 ]; then
    _use_login=1
  fi

  echo "Tags: ${TAGS}"
  # slash is not wanted for folder name
  TARGET_DIR=$(printf '%s\n' "${TAGS}" | sed -e 's/\//_/g')
  SITE_DIR=$(printf '%s\n' "${SITE}" | sed -e 's/\/$//g;s/\//_/g')
  TEMP_PREFIX="${BASE_DIR}/temp/${SITE_DIR}-${TARGET_DIR}"
}

# global variables goes here
init_globals() {
  _version="1.0-rc3" # version of this script
  _use_login=0       # variable to check whether a login is used or not
}

main() {
  # removing GNU-ism as much as possible.
  # NOTE: intentionally NOT exported; the wget calls above pass options
  # after the url, which GNU tools stop accepting when POSIXLY_CORRECT is
  # present in their environment.
  POSIXLY_CORRECT=1
  # initialize global variables
  init_globals
  # print welcome message
  msg_welcome
  # initialization
  init "$@"
  Check_Tools
  Check_Folders

  # let's do the job!
  case "${JOB}" in
    check)
      Generate_Link
      Check_Files
      ;;
    fetch)
      Generate_Link
      Check_Files
      Fetch_Images
      ;;
    quickfetch)
      ISNEW=1
      ISQUICK=1
      Generate_Link
      Check_Files
      Fetch_Images
      ;;
  esac
}

# call the main routine!
main "$@"
