comparison bin/moefetch @ 303:e4208bf9c585

Merge
author Edho Arief <edho@myconan.net>
date Thu, 16 Feb 2012 17:48:04 +0700
parents b90ebadbfd5d
children 21b86001b0c5
comparison
equal deleted inserted replaced
295:ce17ed77a7fa 303:e4208bf9c585
1 #!/bin/sh
2
3 # Copyright (c) 2009-2012, edogawaconan <edho@myconan.net>
4 #
5 # Permission to use, copy, modify, and/or distribute this software for any
6 # purpose with or without fee is hereby granted, provided that the above
7 # copyright notice and this permission notice appear in all copies.
8 #
9 # THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 # ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 # ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 # OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 #
17 # Lots of bugs here. Use with care
18 # USE WITH CARE
19 #
20 # what it does: fetch every picture that has the specified TAGS.
21 # requirement: wget, libxslt, openssl
22
# --- user configuration ------------------------------------------------------

# Extra directories to search for the external tools
# (cut, sed, wc, openssl, wget, xsltproc, grep).
# When set, init() puts this value in front of PATH.
ADDITIONAL_PATH=""

# Server used when no -s option is given. Danbooru-powered sites only.
DEFAULT_SITE="moe.imouto.org"

# Base directory. Must be an absolute path to a directory you own
# (Check_Folders aborts otherwise).
# Downloads are stored in ${BASE_DIR}/<SITE>/<TAGS>.
# Leave empty to use the directory the script is started from.
BASE_DIR=""

# --- nothing below this line is meant to be edited by the user ---------------

# User-Agent header sent with every wget request.
useragent="Mozilla/5.0 (Windows NT 6.1; WOW64; rv:10.0) Gecko/20100101 Firefox/10.0"
38
# Print the banner: program name with ${_version}, the copyright line,
# and one trailing blank line.
msg_welcome() {
  printf '%s\n' \
    "moefetch ${_version}" \
    "Copyright (c) 2009-2012 edogawaconan <edho@myconan.net>" \
    ""
}
45
# Make a path safe to pass to external commands.
# Usage: cmd "$(safe_path "${filename}")"
#
# Arguments: the path (all arguments are joined the way "$*" joins them).
# Output:    the path on stdout, without a trailing newline.
#            A path starting with "." or "/" is printed unchanged; anything
#            else, including the empty string, gets a "./" prefix so that a
#            name starting with "-" cannot be taken for an option.
safe_path() {
  # Match on the first character with a glob pattern instead of piping
  # through cut: no extra process per call, no helper variables left in the
  # global scope, and names containing newlines are judged by their real
  # first character.
  case "$*" in
    .* | /*) printf '%s' "$*" ;;
    *) printf '%s' "./$*" ;;
  esac
}
58
# Print the MD5 digest of the file named by $1.
# The file content is fed to "openssl dgst -md5" on stdin; only the last
# output line is kept, and sed reduces it to the 32 hexadecimal digits.
get_md5() {
  cat "$(safe_path "${1}")" \
    | openssl dgst -md5 \
    | tail -n 1 \
    | sed -e 's/.*\([[:xdigit:]]\{32\}\).*/\1/'
}
61
# Print the last path component of $1.
# The argument goes through safe_path first, so a name that starts with "-"
# reaches basename with a "./" prefix.
get_basename() {
  basename "$(safe_path "${1}")"
}
64
# Print the file name of $1 without its directory and without its last
# extension.
#
# The extension is removed AFTER the directory part is dropped. Stripping
# ".*" from the full path first would cut at a dot inside a directory name
# (for instance ".../moe.imouto.org/tag/file") whenever the file itself has
# no extension.
# A name whose only dot is the leading one (".hidden") is printed unchanged.
get_filename() {
  # Reuse the function's own positional parameters as scratch space so no
  # global variable is introduced.
  set -- "$(get_basename "${1}")"
  case "${1}" in
    ?*.*) printf '%s\n' "${1%.*}" ;;
    *) printf '%s\n' "${1}" ;;
  esac
}
67
# Prepare the tag list for use inside the query URL.
# All arguments are joined as "$*", a single space is appended, and the
# characters "&" and "=" are replaced by %26 and %3D.
get_cleantags() {
  printf '%s ' "$*" \
    | sed -e 's/\&/%26/g' -e 's/=/%3D/g'
}
70
# Tell whether the file name in $1 (directory and extension ignored) is an
# md5 value.
# Output: an empty line when the name is exactly 32 lowercase hexadecimal
#         digits; otherwise the name itself. Callers test the result with
#         [ -n ... ].
#
# The pattern is anchored at both ends. An unanchored, global substitution
# would also empty names made of 64, 96, ... hex digits and report them as
# md5 names.
is_not_md5() {
  get_filename "$1" | sed -e 's/^[0-9a-f]\{32\}$//'
}
73
74
# Fatal error handler.
# Prints an empty line followed by "Fatal error: <message>" on stdout, then
# terminates the script with exit status 1.
# Arguments: $1 - the message.
Err_Fatal() {
  printf '\n%s\n' "Fatal error: ${1}"
  exit 1
}
81
# Handler for failures that should not be able to happen (for example a
# mkdir or mv inside the script's own working directories failing).
# Prints a fixed report request on stdout and exits with status 1.
Err_Impossible() {
  printf '\n%s\n%s\n' \
    "Impossible error. Or you modified content of the working directories when the script is running." \
    "Please report to moefetch.googlecode.com if you see this message (complete with entire run log)"
  exit 1
}
88
# Print the usage text on stdout and exit with status 2.
# The text mentions ${DEFAULT_SITE} as the default for -s.
Err_Help() {
  # Unquoted delimiter on purpose: ${DEFAULT_SITE} must be expanded.
  cat <<EOF
moefetch.sh COMMAND [-n] [-p PASSWORD] [-s SITE_URL] [-u USERNAME] TAGS

COMMAND:
(quick)fetch:
Do a complete update. Add prefix quick to skip file checking
check:
Get list of new files, clean up local folder and print total new files

OPTIONS:
-n:
Skip checking repository directory.
-p PASSWORD:
Specifies password for login.
-s SITE_URL:
Specify URL of the Danbooru powered site you want to leech from. Default is ${DEFAULT_SITE}.
-u USERNAME:
Specifies username for login.
TAGS:
Tags you want to download. Separated by spaces. Tag name follows standard Danbooru tagging scheme.
EOF
  exit 2
}
112
# Build the list of downloadable file URLs for ${TAGS} on ${SITE}.
#
# Reads:   SITE, TAGS, TEMP_PREFIX, useragent, _use_login, and LOGIN_USER /
#          LOGIN_PASS when _use_login is 1.
# Writes:  ${TEMP_PREFIX}-xml (last downloaded page), ${TEMP_PREFIX}-templist
#          (URLs of the last page) and ${TEMP_PREFIX}-list (all URLs).
# Exits through Err_Fatal when a page cannot be downloaded or when the final
# list is empty.
Generate_Link() {
  printf '\n%s\n' "Fetching XML file"
  tempnum=1000
  iternum=1
  : > "${TEMP_PREFIX}-list"
  # Each request asks for at most 1000 posts; a page that yields fewer URLs
  # than that ends the loop.
  while [ "${tempnum}" -ge 1000 ]; do
    url="http://${SITE}/post/index.xml?tags=$(get_cleantags "${TAGS}")&offset=0&limit=1000&page=${iternum}"
    if [ "${_use_login}" -eq 1 ]; then
      url="${url}&login=${LOGIN_USER}&password_hash=${LOGIN_PASS}"
    fi
    wget --quiet "${url}" \
      -O "${TEMP_PREFIX}-xml" \
      --referer="http://${SITE}/post" \
      --user-agent="${useragent}" \
      -e continue=off \
      || Err_Fatal "Failed download catalog file"
    printf '%s' "Processing XML file... "
    # The stylesheet prints the file_url attribute of every <post>; sed then
    # rewrites each URL to end in <md5>.<extension> and grep keeps only the
    # lines that start with "http".
    xsltproc - "${TEMP_PREFIX}-xml" <<EOF | sed 's/.*\(http.*\)\(\/[a-f0-9]\{32\}\).*\.\([^\.]*\)/\1\2.\3/g' | grep ^http > "${TEMP_PREFIX}-templist"
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
<xsl:output method="xml" indent="yes"/>
<xsl:template match="post">
<xsl:value-of select="@file_url" />
</xsl:template>
</xsl:stylesheet>
EOF
    tempnum=$(grep -c . "${TEMP_PREFIX}-templist")
    iternum=$((iternum + 1))
    cat "${TEMP_PREFIX}-templist" >> "${TEMP_PREFIX}-list"
    printf '%s\n' "${tempnum} file(s) available"
  done
  numfiles=$(grep -c . "${TEMP_PREFIX}-list")
  printf '%s\n' "${numfiles} file(s) available on server"
  if [ "${numfiles}" -gt 0 ]; then
    :
  else
    Err_Fatal "Error in processing list or no files can be found with specified tag(s) or site."
  fi
}
143
144
# Start the progress spinner: remember "-" as the current frame in ${_last}
# and print it without a newline.
progress_init() {
  _last="-"
  printf '%s' "${_last}"
}
149
# Advance the progress spinner by one frame.
# The frame stored in ${_last} cycles  - -> \ -> | -> / -> -  ; any other
# value is left as it is. A backspace followed by the (new) frame is printed
# so that the previous frame is overwritten on a terminal.
progress_anim() {
  case "${_last}" in
    /) _last="-" ;;
    -) _last=\\ ;;
    \\) _last=\| ;;
    \|) _last="/" ;;
  esac
  # The frame is passed as data, never as the format string: one of the
  # frames is a lone backslash, which would otherwise form a format that ends
  # in an incomplete escape sequence.
  printf '\b%s' "${_last}"
}
159
# Finish the progress spinner: overwrite the last frame with "done" and end
# the line.
progress_done() {
  printf '\b%s\n' "done"
}
161
# Count the entries of a directory without calling ls.
# Arguments: the directory (all arguments are joined as "$*").
# Output:    the number of existing entries, hidden ones included, with "."
#            and ".." left out.
# Uses the global variables numfiles and dircontent.
Count_Files() {
  numfiles=0
  for dircontent in "${*}/"* "${*}/".*; do
    # Skip the directory itself and its parent.
    case "${dircontent}" in
      "${*}/." | "${*}/..") continue ;;
    esac
    # An unmatched pattern stays in the list as a literal string; -e drops it.
    [ -e "${dircontent}" ] || continue
    numfiles=$((numfiles + 1))
  done
  echo "${numfiles}"
}
172
# Verify that every external program the script relies on can be found.
# Calls Err_Fatal, naming the first missing program, when one is absent.
#
# The first line of the list is the historical set of checked tools. The
# second line adds programs this script invokes but never verified
# (chmod, tail, basename, cat, cp, mv, rmdir, touch).
Check_Tools() {
  for cmd in \
    cut sed wc wget xsltproc xargs rm mkdir chown comm grep date openssl \
    chmod tail basename cat cp mv rmdir touch; do
    command -v "${cmd}" >/dev/null 2>&1 || Err_Fatal "${cmd} doesn't exist in ${PATH}"
  done
}
182
# Make sure the working directories and temporary files exist.
#
# Reads:  BASE_DIR, SITE_DIR, TARGET_DIR, TEMP_PREFIX.
# Writes: ISNEW=1 when the repository directory
#         ${BASE_DIR}/${SITE_DIR}/${TARGET_DIR} contains no entries.
# Creates temp, trash, deleted and the repository directory below BASE_DIR,
# plus the ${TEMP_PREFIX}-{error,ok,list,newlist,templist} files.
# Exits through Err_Fatal when BASE_DIR is not owned by the caller or a
# temporary file cannot be created, and through Err_Impossible when mkdir or
# chmod fails.
Check_Folders() {
  [ -O "${BASE_DIR}" ] || Err_Fatal "You don't own ${BASE_DIR}. Please fix ${BASE_DIR} or run this script in your own directory."
  for directory in temp trash deleted "${SITE_DIR}/${TARGET_DIR}"; do
    if [ ! -d "${BASE_DIR}/${directory}" ]; then
      mkdir -p "${BASE_DIR}/${directory}" || Err_Impossible
    fi
    if [ ! -O "${BASE_DIR}/${directory}" ]; then
      echo "You don't own the ${BASE_DIR}/${directory}, applying globally writeable permission on it"
      # NOTE(review): this makes the whole tree world-writable; kept as is
      # because callers may depend on it.
      chmod -R u=rwX,g=rwX,o=rwX "${BASE_DIR}/${directory}" || Err_Impossible
    fi
  done
  if [ "$(Count_Files "${BASE_DIR}/${SITE_DIR}/${TARGET_DIR}")" -eq 0 ]; then
    ISNEW=1
  fi
  for i in error ok list newlist templist; do
    # The handler is named Err_Fatal; calling a non-existent name here would
    # only print "command not found" and let the script carry on.
    touch "${TEMP_PREFIX}-${i}" || Err_Fatal "Error creating ${TEMP_PREFIX}-${i}. This shouldn't happen"
  done
}
201
# Move everything that does not belong in the repository directory
# ${BASE_DIR}/${SITE_DIR}/${TARGET_DIR} into a time-stamped trash directory.
#
# An entry is trash when it is a directory, when its name is not an md5
# value (is_not_md5), or when its name does not occur in ${TEMP_PREFIX}-list.
#
# Reads:   BASE_DIR, SITE_DIR, TARGET_DIR, TEMP_PREFIX.
# Writes:  trash_dir, trashes, has_trash, trash, trash_name (globals).
# Output:  progress spinner and, if anything was moved, the list of names.
# Exits through Err_Impossible when mkdir, chmod or mv fails.
Cleanup_Repository() {
  printf "Cleaning up repository folder... "
  progress_init
  # Only the time stamp goes through date's format string, so a "%" in the
  # site or tag name is not interpreted. The path no longer embeds the
  # previous value of trash_dir.
  trash_dir="${BASE_DIR}/trash/${SITE_DIR}-${TARGET_DIR}-$(date -u "+%Y%m%d-%H.%M")"
  trashes="These files have been moved to ${trash_dir}:"
  has_trash=
  if [ ! -d "${trash_dir}" ]; then
    mkdir -p "${trash_dir}" || Err_Impossible
  elif [ ! -O "${trash_dir}" ]; then
    # Fix the permissions of the trash directory itself (not of whatever
    # directory another function looped over last).
    chmod -R u=rwX,g=rwX,o=rwX "${trash_dir}" || Err_Impossible
  fi
  for trash in "${BASE_DIR}/${SITE_DIR}/${TARGET_DIR}/"*; do
    if [ -e "${trash}" ]; then
      trash_name=$(get_basename "${trash}")
      # -F: the file name is looked up as a literal string, so the dot before
      # the extension cannot match an arbitrary character.
      if [ -d "${trash}" ] \
        || [ -n "$(is_not_md5 "${trash}")" ] \
        || ! grep -q -F -- "${trash_name}" "${TEMP_PREFIX}-list"; then
        has_trash=1
        mv -f -- "${trash}" "${trash_dir}" || Err_Impossible
        trashes="${trashes}
${trash_name}"
      fi
    fi
    progress_anim
  done
  # Succeeds only when nothing was moved; a non-empty trash directory stays.
  rmdir "${trash_dir}" 2>/dev/null
  progress_done
  if [ -n "${has_trash}" ]; then
    echo "${trashes}"
  fi
}
235
236 # check files correctness
237 Check_Files() {
238 if [ ! -n "${ISNEW}" ]; then
239 [ -z "${NOCLEAN}" ] && Cleanup_Repository
240 printf "Checking for errors... "
241 progress_init
242 files_error="These files do not match its md5:"
243 files_notdanbooru="These files are not checked:"
244 has_err_filename=
245 has_err_md5=
246 > "${TEMP_PREFIX}-error"
247 > "${TEMP_PREFIX}-ok"
248 for file in "${BASE_DIR}/${SITE_DIR}/${TARGET_DIR}/"*
249 do
250 if [ -e "${file}" ]; then
251 if [ -n "$(is_not_md5 "${file}")" ] || [ -d "${file}" ]; then
252 files_notdanbooru="${files_notdanbooru}
253 $(get_basename "${file}")"
254 has_err_filename=1
255 else
256 if [ "$(get_md5 "${file}")" = "$(get_filename "${file}")" ]; then
257 echo "$(get_basename "${file}")" >> "${TEMP_PREFIX}-ok"
258 else
259 rm "${file}" || Err_Fatal "Error removing ${file}"
260 echo "$(get_basename "${file}")" >> "${TEMP_PREFIX}-error"
261 files_error="${files_error}
262 $(get_basename "${file}")"
263 has_err_md5=1
264 fi
265 fi
266 fi
267 progress_anim
268 done
269 progress_done
270 if [ ! -n "${has_err_md5}" ] && [ ! -n "${has_err_filename}" ]; then
271 echo "All files OK"
272 else
273 if [ -n "${has_err_md5}" ]; then
274 echo "${files_error}"
275 echo "$(grep -c . "${TEMP_PREFIX}-error") file(s) removed"
276 fi
277 [ -n "${has_err_filename}" ] && echo "${files_notdanbooru}"
278 fi
279 echo "$(grep -c . "${TEMP_PREFIX}-ok") file(s) available locally"
280
281 printf "Generating list of new files... "
282 progress_init
283 cp -f "${TEMP_PREFIX}-list" "${TEMP_PREFIX}-templist"
284 while read -r is_ok; do
285 grep -v "${is_ok}" "${TEMP_PREFIX}-templist" > "${TEMP_PREFIX}-newlist"
286 cp -f "${TEMP_PREFIX}-newlist" "${TEMP_PREFIX}-templist" || Err_Impossible
287 progress_anim
288 done < "${TEMP_PREFIX}-ok"
289 progress_done
290 echo "$(grep -c . "${TEMP_PREFIX}-newlist") file(s) to be downloaded"
291 else
292 if [ -n "${ISQUICK}" ]; then
293 echo "Quick mode selected. Skipping check"
294 else
295 echo "Empty local repository"
296 fi
297 cat "${TEMP_PREFIX}-list" > "${TEMP_PREFIX}-newlist"
298 fi
299 }
300
# Download every URL listed in ${TEMP_PREFIX}-newlist into the repository
# directory ${BASE_DIR}/${SITE_DIR}/${TARGET_DIR}.
#
# Reads:   TEMP_PREFIX, BASE_DIR, SITE_DIR, TARGET_DIR, SITE, useragent.
# Effects: changes the current directory to the repository directory;
#          wget writes its log to ${TEMP_PREFIX}.log.
# Returns: 0 when there is nothing to do or wget succeeded, otherwise wget's
#          exit status. Exits through Err_Fatal when the repository directory
#          cannot be entered.
Fetch_Images() {
  if [ "$(grep -c . "${TEMP_PREFIX}-newlist")" -eq 0 ]; then
    echo "No new file"
    return 0
  fi
  printf "Downloading files... "
  # wget saves into the current directory, so a failed cd must stop the run
  # instead of scattering files wherever the script was started.
  cd "${BASE_DIR}/${SITE_DIR}/${TARGET_DIR}" \
    || Err_Fatal "Cannot enter ${BASE_DIR}/${SITE_DIR}/${TARGET_DIR}"
  if wget -e continue=on -i "${TEMP_PREFIX}-newlist" -o "${TEMP_PREFIX}.log" --referer="http://${SITE}/post" --user-agent="${useragent}"; then
    echo "done"
  else
    fetch_status=$?
    echo "finished with errors, see ${TEMP_PREFIX}.log"
    return "${fetch_status}"
  fi
}
311
# Parse the command line and set up the base variables.
#
# Usage: init COMMAND [-n] [-p PASSWORD] [-s SITE_URL] [-u USERNAME] TAGS
#
# Writes: JOB, SITE, TAGS, NOCLEAN, ISQUICK, ISNEW, BASE_DIR, TARGET_DIR,
#         SITE_DIR, TEMP_PREFIX; LOGIN_USER / LOGIN_PASS when given;
#         _use_login=1 when both -u and -p were given; PATH when
#         ADDITIONAL_PATH is set.
# Exits through Err_Help on a missing/unknown command, fewer than two
# arguments, or an unknown/incomplete option, and through Err_Fatal when no
# tag remains after the options.
init() {
  # Put the additional tool directories, if any, in front of PATH.
  if [ -n "${ADDITIONAL_PATH}" ]; then
    PATH="${ADDITIONAL_PATH}:${PATH}"
    export PATH
  fi

  # Mode flags always start empty; NOCLEAN is included so that a value
  # inherited from the environment cannot switch the cleanup off.
  ISQUICK=
  ISNEW=
  NOCLEAN=

  # At least a command and one tag are required.
  [ $# -lt 2 ] && Err_Help
  case "$1" in
    check | fetch | quickfetch)
      echo "Starting..."
      JOB="$1"
      ;;
    *)
      Err_Help
      ;;
  esac
  shift

  SITE=
  TAGS=
  has_pass=0
  has_user=0
  # getopts keeps its position in OPTIND; start from the first argument.
  OPTIND=1
  while getopts "s:nu:p:" opt; do
    case "${opt}" in
      s) SITE="${OPTARG}" ;;
      n) NOCLEAN=1 ;;
      p)
        # The site expects the SHA-1 hex digest, not the clear-text password.
        LOGIN_PASS=$(printf "%s" "${OPTARG}" | openssl dgst -sha1 | sed -e 's/.*\([[:xdigit:]]\{40\}\).*/\1/')
        has_pass=1
        ;;
      u)
        LOGIN_USER="${OPTARG}"
        has_user=1
        ;;
      *)
        # Unknown option or missing option argument. Tags that start with
        # "-" must be passed after "--".
        Err_Help
        ;;
    esac
  done
  # OPTIND already accounts for a terminating "--".
  shift $((OPTIND - 1))
  TAGS="$*"
  [ -n "${SITE}" ] || SITE=${DEFAULT_SITE}
  [ -n "${TAGS}" ] || Err_Fatal "No tag specified"

  # Base folder: configured value, else current folder, else ${HOME}.
  [ -n "${BASE_DIR}" ] || BASE_DIR=${PWD}
  [ -n "${BASE_DIR}" ] || BASE_DIR=${HOME}
  # Force an absolute path.
  case "${BASE_DIR}" in
    /*) ;;
    *) BASE_DIR="/${BASE_DIR}" ;;
  esac

  # Log in only when both user name and password were supplied.
  if [ "${has_pass}" -eq 1 ] && [ "${has_user}" -eq 1 ]; then
    _use_login=1
  fi

  echo "Tags: ${TAGS}"
  # Slashes are not wanted in folder names; a trailing slash of the site is
  # dropped first.
  TARGET_DIR=$(echo "${TAGS}" | sed -e 's/\//_/g')
  SITE_DIR=$(echo "${SITE}" | sed -e 's/\/$//g;s/\//_/g')
  TEMP_PREFIX="${BASE_DIR}/temp/${SITE_DIR}-${TARGET_DIR}"
}
379
# Set the script-wide defaults.
#   _version    version string shown by msg_welcome
#   _use_login  0 = anonymous access; init switches it to 1 when both a user
#               name and a password were given
init_globals() {
  _use_login=0
  _version="1.0-rc3"
}
386
# Entry point: set up, then run the job selected on the command line.
#   check       build the URL list and verify the local files
#   fetch       as check, then download what is missing
#   quickfetch  as fetch, but with ISNEW and ISQUICK set so that the local
#               file check is skipped
# Arguments: the script's command line, passed on to init.
main() {
  # removing GNU-ism as much as possible
  POSIXLY_CORRECT=1
  init_globals
  msg_welcome
  init "$@"
  Check_Tools
  Check_Folders

  # Anything other than the three known jobs does nothing further.
  case "${JOB}" in
    check | fetch | quickfetch) ;;
    *) return 0 ;;
  esac

  if [ "${JOB}" = "quickfetch" ]; then
    ISNEW=1
    ISQUICK=1
  fi

  Generate_Link
  if [ "${JOB}" = "check" ]; then
    Check_Files
    return
  fi
  Check_Files
  Fetch_Images
}
421
422 # call the main routine!
423 main "$@"
424