Simple Script to Download epaper from The Hindustan

Print Friendly, PDF & Email
#!/bin/bash
#ishan dot karve at gmail dot com
#
#Script to download epaper from The Hindustan
#Written on request for Jitendra
#No subscription needed any more .. please donate the money to the Prime Minister's Welfare Fund
#As always, it's free to use...
#Get user to select edition.
##############################################################################
START=$(date +%s)

#############################################################################################################
# Pick the edition date independently of where the script runs.
# The script assumes every digital edition is published at or after 6 AM IST.
# Before 6 AM IST the time zone is switched to America/New_York, which is
# still on the previous calendar day at that hour, so the previous day's
# edition is requested instead of one that is presumably not published yet.
#############################################################################################################

# Start from Indian time to find out what hour it is in India.
export TZ=Asia/Calcutta
IST_hour=$(date +%H)
IST_day=$(date +%d)   # not read again in this script; kept for compatibility

# 10# forces base 10 so that hours such as "08" and "09" are not read as octal.
if (( 10#$IST_hour < 6 )); then
  export TZ=America/New_York
  echo "Using New York Time Zone"
else
  echo "Using Indian Time Zone"
fi

# Take one snapshot of the clock so every field describes the same instant
# (separate date calls could straddle a second or a midnight boundary).
# nz_day / nz_month are the day and month without the leading zero.
IFS=' ' read -r year month day hour min sec nz_month nz_day \
  <<< "$(date '+%Y %m %d %H %M %S %-m %-d')"
# Print the current timestamp as "YYYY-MM-DD HH:MM:SS" (in the exported TZ);
# used as the prefix of every log line.
datetime() { date "+%Y-%m-%d %H:%M:%S"; }
##############################################################################

publication_name="The_Hindustan"
echo "$(datetime) $publication_name Script Started"

# Delete archives left over from previous runs, complete or not.
# rm -f is silent when the glob matches nothing, so there is no need to
# count files by parsing the output of ls first.
echo "$(datetime) Cleaning up previous downloads"
rm -f -- "$HOME/public_html/news_archive/${publication_name}"_*.tar.gz
echo "$(datetime) Downloading Index Page"
# The edition's index page carries the list of available page names in a
# "PageImgNames=...;" assignment.
url="http://epaper.livehindustan.com/PUBLICATIONS/HT/HT/$year/$month/$day/index.shtml"
#echo $url
# Keep the page in memory instead of a fixed, predictable file under /tmp.
# -f/-S make curl report HTTP and transfer errors instead of failing silently.
if ! index_html=$(curl -fsS "$url"); then
  echo "$(datetime) Could not fetch $url" >&2
  index_html=""
fi

echo "$(datetime) Parsing Avl Page Nos"
# Extract the value assigned to PageImgNames and drop the single quotes
# around each page name.
pagenos=$(sed -n -e 's/.*PageImgNames=\(.*\);.*/\1/p' <<< "$index_html")
pagenos=${pagenos//\'/}
#echo $pagenos

# Load the comma-separated page names into the array pgrecord.
# IFS is changed for the read command only; a bare "IFS=',' pgrecord=(...)"
# would leave IFS set to ',' for the rest of the script and would also
# glob-expand the page names.
pgrecord=()
IFS=',' read -r -a pgrecord <<< "$pagenos"

# Working directory that holds the individual page PDFs of this edition.
# It is named after the same day/month/year values used for the download
# URLs, rather than asking date for them three more times.
ty_dir="$HOME/public_html/news_scripts/ty_dld/The_Hindustan_${day}_${month}_${year}"
#ty_dir="$HOME/Desktop/The_Hindustan_"$day"_"$month"_$year"
#echo $ty_dir
# mkdir -p creates missing parent directories and is a no-op when the
# directory already exists; nothing can be downloaded without it.
if ! mkdir -p -- "$ty_dir"; then
  echo "$(datetime) Cannot create directory $ty_dir" >&2
  exit 1
fi

# Download every page listed in pgrecord into $ty_dir as <page>.pdf.
#echo "$(datetime) Please be patient..Bandwidth intensive operation starts..;-)"
pg_array_size=${#pgrecord[@]}
echo "$(datetime) Downloading The Hindustan Paper .. total $pg_array_size pages"
pages_downloaded=0

for (( i = 0; i < pg_array_size; i++ )); do
  #echo -n "$(datetime) Downloading Page ${pgrecord[i]}"
  O_FILE="$ty_dir/${pgrecord[i]}.pdf"
  I_FILE="http://epaper.livehindustan.com/PUBLICATIONS/HT/HT/$year/$month/$day/PagePrint/${day}_${month}_${year}_${pgrecord[i]}.pdf"
  #echo $I_FILE
  #echo $O_FILE
  #curl  $I_FILE -o "$O_FILE"
  # Capture wget's messages so the last one can be reported if nothing could
  # be downloaded; count a page only when wget reports success.
  if debug=$(wget -c -O "$O_FILE" "$I_FILE" 2>&1); then
    pages_downloaded=$(( pages_downloaded + 1 ))
    #echo "....Completed Download:-)"
  else
    # wget -O creates the output file even when the transfer fails; remove
    # the leftover so it is neither archived nor blocks a later rmdir.
    rm -f -- "$O_FILE"
  fi
  printf '%s\n' "$debug"
done

echo "$(datetime) Downloaded $pages_downloaded pages"
if [[ "$pages_downloaded" -ne 0 ]]; then
  echo "$(datetime) Combining all pages into a single tar.gz archive"
  # Bundle the individual PDF files into one archive.
  archive_name="${publication_name}_${day}_${month}_${year}.tar.gz"
  archive_path="$HOME/public_html/news_archive/$archive_name"

  # cd to the document source directory so the archive holds bare file names.
  # Stop if that fails: tar would otherwise pick up *.pdf from whatever
  # directory the script happens to be in.
  if ! cd "$ty_dir"; then
    echo "$(datetime) Cannot enter directory $ty_dir" >&2
    exit 1
  fi
  # tar the documents
  tar -zcf "$archive_path" -- *.pdf
  # empty the directory
  rm -- "$ty_dir"/*.pdf

  # Get the archive file size (empty if stat fails, e.g. no archive written).
  arch_size=$(stat -c %s -- "$archive_path")

  END=$(date +%s)
  DIFF=$(( END - START ))
  echo "$(datetime) Script Execution Completed in $DIFF seconds"

  # An archive of 1024 bytes or less cannot contain real pages. A single
  # if/else also covers a size of exactly 1024 bytes, which the separate
  # -gt / -lt tests left unhandled.
  if [[ "${arch_size:-0}" -gt 1024 ]]; then
    echo "$(datetime) File Location $archive_path ($arch_size bytes)"
  else
    echo "$(datetime) Script Execution Failed.. FileSize too Small $arch_size bytes"
    rm -f -- "$archive_path"
    if [[ -d "$ty_dir" ]]; then rmdir -- "$ty_dir"; fi
  fi
fi

# Nothing was downloaded: report the last wget output and the last URL that
# was tried, then remove the (empty) working directory.
if [[ "$pages_downloaded" -eq 0 ]]; then
  echo "$(datetime) Script Execution Failed"
  echo "$(datetime) Last Debug .. $debug"
  echo "$(datetime) Last Debug .. $I_FILE"
  # remove directory (rmdir only succeeds when it is empty)
  if [[ -d "$ty_dir" ]]; then rmdir -- "$ty_dir"; fi
fi

How to get it running

Copy the script to your Linux desktop and save it as thehindustan.sh.
Open a terminal to get a command prompt.
Type the following commands:

cd ~/Desktop
chmod +x thehindustan.sh
./thehindustan.sh

Ishan Karve

About Ishan Karve

Ishan Karve is just an everyday, normal guy next door who happens to be an Electronics Engineer by profession and dabbles with PHP, JavaScript, C++ and Python. His interests vary as the seasons change: they range from astronomy to soul searching. This site is just a reflection of what he does to keep his mind engaged when he is not occupied by work and family. He is an extremely objective guy and is always ready for a good argument, of course over a glass of 40% proof alcohol.
This entry was posted in BASH, Programming. Bookmark the permalink.

One Response to Simple Script to Download epaper from The Hindustan

  1. Kanhialal says:

    Hi
    Friends I am going to design website like.
    sindhexpress.com.pk/epaper/index.html
    I have found here article related with my requirement kindly can any one help me to do my job.
    I have need of php or asp.net script.
    Thanks.

Leave a Reply

Your email address will not be published. Required fields are marked *