#!/bin/bash
# Author : Richard DEMONGEOT - http://www.demongeot.biz/outils/SiteMap.html
#
# This file is licensed under the GPL V3.
# However, I kindly ask you to send me (by e-mail or by any other means that
# suits you) the modifications you may make to it, together with the reasons
# for them.
#
#--------------------------------------------------------------------------------------
#
#This program is free software; you can redistribute it and/or modify
#it under the terms of the GNU General Public License as published by
#the Free Software Foundation; either version 3 of the License.
#
#Thanks to send me any change you can wrote to this project and their reasons.
#
#This program is distributed in the hope that it will be useful,
#but WITHOUT ANY WARRANTY; without even the implied warranty of
#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#GNU General Public License for more details.
#
#You should have received a copy of the GNU General Public License
#along with this program; if not, write to the Free Software
#Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
#--------------------------------------------------------------------------------------
#
# Purpose:
#   Crawl a web site starting at "/", following the double-quoted href values
#   of <a> tags that stay on the same host, and write the list of pages that
#   could be downloaded to ./result/sitemap.xml.
#
# Usage:
#   script http[s]://my.site.com [session_variable_name]
#
#   $1  Base URL of the site. It is expected to be the site root: paths found
#       while crawling are treated as relative to the host and appended to it.
#   $2  Optional name of a session query-string variable that must be removed
#       from every link (otherwise every visit yields "new" URLs).
#
# Side effects:
#   ./tmp, ./temp and ./result (relative to the current directory) are
#   deleted and recreated on every run.

set -u

# Print the usage line (runtime text kept as in the original script).
Usage () {
  echo "$0 http[s]://mon.site.com [nom_de_la_variable_de_session]" >&2
  #echo "$0 http[s]://mon.site.com [nom_de_la_variable_de_session nombre_de_caractere_pour_la_variable_de_session]"
}

if [ $# -ne 1 ] && [ $# -ne 2 ]; then
  Usage
  exit 1
fi

# Variables
BASEURL="${1%/}"   # a trailing slash would produce "//" once a path is appended
DOMAIN=$(printf '%s\n' "$BASEURL" | cut -d '/' -f 3)
if [ -z "$DOMAIN" ]; then
  # No host part: the argument is not of the form scheme://host[/...]
  Usage
  exit 1
fi

CURRENTDIR=$(pwd)
TMP="$CURRENTDIR/tmp"            # working lists (links seen, links visited, ...)
TEMP="$CURRENTDIR/temp"          # scratch area for the page being parsed
RESULTDIR="$CURRENTDIR/result"   # final sitemap.xml
# Extended regex: any link matching it is never queued for crawling.
IGNORELIST='\.avi|\.mkv|\.pdf|#|\.jpg|\.png|\.tar|\(|\)'
URLTODO=1

if [ $# -eq 2 ]; then
  TREAT_SESS=1
  SESSION_VAR_NAME=$2
else
  TREAT_SESS=0
  SESSION_VAR_NAME=""
fi

# Environment setup: always start from empty working directories.
for dir in "$TMP" "$TEMP" "$RESULTDIR"; do
  # ${dir:?} aborts instead of running "rm -rf" on an empty path.
  rm -rf -- "${dir:?}"
  if ! mkdir -p -- "$dir"; then
    echo "Cannot create directory $dir" >&2
    exit 1
  fi
done

# Functions

# Delete every file given as argument, checking first that it exists.
# Arguments: $@ - paths of the files to delete
# Outputs:   a "not found" message on stdout for each missing file
delete () {
  local file
  for file in "$@"; do
    if [ -f "$file" ]; then
      rm -f -- "$file"
    else
      echo "Fichier $file introuvable"
    fi
  done
}

# Remove duplicate (and blank) lines from the file given as argument.
# The file is rewritten in place, sorted.
# Arguments: $1 - path of the file to deduplicate
# Returns:   0 on success, 1 if the temporary file could not be created/written
ElimineDoublon () {
  local sorted
  # Temporary file next to the target: unpredictable name, same filesystem
  # (so the final mv is a plain rename).
  sorted=$(mktemp "$1.XXXXXX") || return 1
  if awk 'NF' < "$1" | LC_ALL=C sort -u > "$sorted"; then
    mv -f -- "$sorted" "$1"
  else
    rm -f -- "$sorted"
    return 1
  fi
}

# Delete the file given as argument when it exists and is empty.
# Arguments: $1 - path of the file to test
TestSize () {
  if [ -f "$1" ] && [ ! -s "$1" ]; then
    delete "$1"
  fi
}

# Print, one per line, the double-quoted href value of every <a ...> tag found
# in an HTML file. Matching is line based: a tag split over several lines is
# not seen. "&amp;" is decoded to "&" so the URL can be requested as-is.
# Arguments: $1 - path of the HTML file
ExtractLinks () {
  grep -oiE '<a([[:space:]][^>]*)?[[:space:]]href="[^"]*"' < "$1" \
    | sed -e 's/"$//' -e 's/.*"//' -e 's/&amp;/\&/g'
  # sed: drop the closing quote, then everything up to the opening quote
  # (the value itself cannot contain a double quote).
}

# Collapse "", "." and ".." segments of an absolute path; the query string is
# left untouched. Without this, pages linking to "../x" make the set of URLs
# grow without end (/a/../a/../a/...).
# Arguments: $1 - absolute path, optionally followed by ?query
# Outputs:   the normalised path (+ query) on stdout
NormalizePath () {
  local path="${1%%\?*}"
  local query="" trailing="" rest seg out=""

  if [[ $1 == *\?* ]]; then
    query="?${1#*\?}"
  fi
  case $path in
    */ | */. | */..) trailing="/" ;;
  esac

  rest=$path
  while [ -n "$rest" ]; do
    seg=${rest%%/*}
    if [[ $rest == */* ]]; then
      rest=${rest#*/}
    else
      rest=""
    fi
    case $seg in
      '' | .) ;;
      ..) out=${out%/*} ;;
      *) out="$out/$seg" ;;
    esac
  done

  out="$out$trailing"
  printf '%s\n' "${out:-/}$query"
}

# Filter (stdin -> stdout): when a session variable name was given, remove it
# from every URL, and if that removed the leading "?" turn the first "&" back
# into "?". Otherwise copy the input unchanged.
# NOTE: SESSION_VAR_NAME is inserted as-is into a sed expression; it must not
# contain "/" or regex metacharacters.
StripSession () {
  if [ "$TREAT_SESS" -eq 1 ]; then
    sed -e "s/[?&]${SESSION_VAR_NAME}[^&]*//" -e '/?/!s/&/?/'
  else
    cat
  fi
}

# Download one page and queue the local links it contains.
# Globals:   BASEURL, DOMAIN, IGNORELIST, TMP, TEMP (read)
# Arguments: $1 - absolute path of the page on the site (starts with "/")
# Files:     appends $1 to $TMP/crowled (visited, whether or not the download
#            worked) and, on success, to $TMP/crowledok (goes to the sitemap);
#            appends new links to $TMP/LocalLinksShowing and deduplicates it.
# Returns:   0 on success, 1 if the page could not be downloaded
CrowlLink () {
  local url="$1"
  local path rep page link
  local scheme_re='^[A-Za-z][A-Za-z0-9+.-]*:'

  # Directory part of the page, without leading/trailing slash; relative
  # links found in the page are resolved against it.
  path="${url%%\?*}"
  rep="${path#/}"
  if [[ $rep == */* ]]; then
    rep="${rep%/*}"
  else
    rep=""
  fi

  echo "$url"
  # Recorded before the download so a failing URL is never retried.
  printf '%s\n' "$url" >> "$TMP/crowled"

  page="$TEMP/page"
  rm -f -- "$page"
  if ! wget -O "$page" "$BASEURL$url"; then
    echo "wget failed for $BASEURL$url" >&2
    return 1
  fi
  printf '%s\n' "$url" >> "$TMP/crowledok"

  ExtractLinks "$page" \
    | while IFS= read -r link; do
        case $link in
          '')
            continue
            ;;
          http://"$DOMAIN" | https://"$DOMAIN" | http://"$DOMAIN"[/?]* | https://"$DOMAIN"[/?]*)
            # Absolute link to this host: keep only what follows the host.
            link="${link#*://"$DOMAIN"}"
            if [[ $link != /* ]]; then
              link="/$link"
            fi
            ;;
          //*)
            # Protocol-relative link to some host: not handled.
            continue
            ;;
          /*)
            # Already an absolute path on this site.
            ;;
          \?*)
            # Query-only link: applies to the current page.
            link="$path$link"
            ;;
          *)
            # Anything with a scheme (http:, ftp:, mailto:, javascript:, ...)
            # points elsewhere; the rest is relative to the page directory.
            if [[ $link =~ $scheme_re ]]; then
              continue
            fi
            link="/${rep:+$rep/}$link"
            ;;
        esac
        NormalizePath "$link"
      done \
    | StripSession \
    | grep -Ev -- "$IGNORELIST" >> "$TMP/LocalLinksShowing"

  ElimineDoublon "$TMP/LocalLinksShowing"
}

# Main program
echo "/" >> "$TMP/LocalLinksShowing"
: > "$TMP/crowled"
: > "$TMP/crowledok"

# Repeat full passes over the list of known links until a pass finds nothing
# left to visit.
while [ "$URLTODO" -ne 0 ]; do
  URLTODO=0
  # Iterate over a snapshot: CrowlLink rewrites the list while we loop.
  cp -- "$TMP/LocalLinksShowing" "$TMP/ToVisit"
  while IFS= read -r URL <&3; do
    # Exact, literal, whole-line comparison ("/a" must not match "/ab").
    if ! grep -qxF -- "$URL" "$TMP/crowled"; then
      CrowlLink "$URL"
      URLTODO=1
    fi
  done 3< "$TMP/ToVisit"
done

LC_ALL=C sort -u "$TMP/crowledok" > "$TMP/crowledb"

# Sitemap output.
# NOTE(review): the markup on the first, second and last lines and around each
# URL was missing from the damaged source and has been reconstructed (the
# namespace is the one named by xsi:schemaLocation) - confirm against the
# upstream script.
{
  echo '<?xml version="1.0" encoding="UTF-8"?>'
  echo '<urlset xmlns="http://www.google.com/schemas/sitemap/0.84"'
  echo "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\""
  echo "xsi:schemaLocation=\"http://www.google.com/schemas/sitemap/0.84 http://www.google.com/schemas/sitemap/0.84/sitemap.xsd\">"
  grep "^/" "$TMP/crowledb" \
    | while IFS= read -r ligne; do
        printf ' <url><loc>%s</loc></url>\n' "$BASEURL$ligne"
      done \
    | sed 's/&/\&amp;/g'   # "&" must be written as an entity in XML
  echo '</urlset>'
} > "$RESULTDIR/sitemap.xml"