Script to download e-newspapers from this site

Below are two scripts, a Bash version and a Python version, to download e-newspapers from this site.

They require a Linux box, preferably with Ghostscript installed.

How to use:

  1. Copy and paste the code into an empty file.
  2. Save the file as anything.sh, e.g. foo.sh.
  3. Make the file executable with chmod +x from a terminal.
  4. Execute the file from the terminal (see the example below the list).
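
A minimal sketch of those last two steps, assuming the file was saved as foo.sh:

chmod +x foo.sh
./foo.sh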
Bash Script
#!/bin/bash
#date definitions
clear

day=`date +%d`
nz_day=`date +%-d`
month=`date +%m`
year=`date +%Y`
nz_month=`date +%-m`
hour=`date +%H`
min=`date +%M`
sec=`date +%S`
#download page containing newspaper links
url="http://www.karve.in/news_archive/"
curl -s $url > /tmp/karve
#extract the .tar.gz archive links from the downloaded page
links=$(grep -o 'href="[^"]*\.tar\.gz"' /tmp/karve | sed -e 's/^href=//' -e 's/"//g' | tr "\n" " ")
IFS=' ' url_links=(${links})
#get number of downloads urls
links_size=${#url_links[@]} 
#define ultimate download directory
dl_dir=$HOME/Desktop/news_"$day"_"$month"_"$year"
#if the directory does not exist then create it
if [ ! -d "$dl_dir" ]; then mkdir $dl_dir;fi
#iterate through array and download

for ((  i =  0;  i < $links_size;  i++  ))
    do
     gs_flag=1 #ghostscript flag
     #cd to download directory
     cd $dl_dir
     #extract filename from url
     dl_link=${url_links[i]}
     echo "Processing Link $dl_link"
     fname=$( basename $dl_link )
	 #strip file extension from filename
	 filename=${fname%.*} #strip .gz
	 filename=${filename%.*} #strip .tar
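	 #e.g. a hypothetical foo.tar.gz becomes foo.tar and then just foo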
	 #download file
	 echo -n "Downloaing $fname...."
	 debug=`wget -c ${url_links[i]} 2>&1`
     echo "done"
     #make a directory of current filename
     if [ ! -d "$filename" ]; then mkdir $filename;fi
     #extract archive to above directory
     tar xvf $fname -C ./$filename >/dev/null 2>&1
     #check if ghost script is installed
     which gs &> /dev/null
	 if [ $? -ne 0 ]; then gs_flag=0; fi
     if  [ $gs_flag -eq 1 ]
     then
     #combine into a single pdf file
     #remove existing file if any
     if [ -f "$filename.pdf" ]; then rm $filename.pdf;fi
     echo "##############################################################"
     echo "Combining individual documents into a single pdf document.."
     gs -q -sstdout=%stderr -dNOPAUSE -sDEVICE=pdfwrite -sOUTPUTFILE=$filename.pdf -dBATCH ./$filename/*.pdf 2>/dev/null
     echo "E-paper Available at $dl_dir/$filename.pdf"
     echo "##############################################################"
     #remove individual pdf files
     rm ./$filename/*.pdf
     #remove directory
     rmdir ./$filename
	 #remove downloaded file
	 rm $fname 	
	 fi
     if  [ $gs_flag -eq 0 ]
     then
     echo "##############################################################"
     echo "Ghost script not found installed.."
     echo "Install ghost script .. type sudo aptt-get install gs"
     echo "Files have been downloaded at $dl_dir/$filename"
     echo "##############################################################"
   
     fi
    done


Python Script
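
The script below needs the pycurl and BeautifulSoup Python modules (as its header notes), plus wget and Ghostscript available on the system. A minimal install sketch, assuming a Debian/Ubuntu box; package names may differ on other distributions:

sudo apt-get install python-pycurl python-beautifulsoup wget ghostscript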

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
#       ishan.py
#       
#       Copyright 2011 vroom 
#       
#       This program is free software; you can redistribute it and/or modify
#       it under the terms of the GNU General Public License as published by
#       the Free Software Foundation; either version 2 of the License, or
#       (at your option) any later version.
#       
#       This program is distributed in the hope that it will be useful,
#       but WITHOUT ANY WARRANTY; without even the implied warranty of
#       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#       GNU General Public License for more details.
#       
#       You should have received a copy of the GNU General Public License
#       along with this program; if not, write to the Free Software
#       Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
#       MA 02110-1301, USA.
#       
#      Script chain of events
#      	Step 1	: Download www.karve.in/news_archive/index.php   (Requires Curl)
#		Step 2	: Parse the file and extract links from it   (Requires the Beautiful Soup library)
#		Step 2.1: Check if the downloaded link ends with a .gz extension and hence is a valid download link
#		Step 2.2: Extract the date and file name from the url link
#       Step 3	: Download the file from the server
#      	Step 4	: Extract the downloaded file
#       Step 5	: Check if Ghostscript present
#      	Step 6	: If GS present then integrate individual pages into a single pdf document
#       
#     Basic Function Defs        
#       wget_download(src_url,dest_dir,dest_filename)
#       download_file(src_url,dest_dir,dest_filename)
#       extract_file(src_file,dest_dir)
#      	merge_pdf(src_dir,dest_dir,dest_filename)
#       
#      
#	Reference
#	http://pythonicprose.blogspot.com/2009/10/python-extract-or-unzip-tar-file.html
##################################################################################################################       

import sys
import pycurl
import StringIO
import tarfile
import datetime
from BeautifulSoup import BeautifulSoup
import os
import stat

def nukedir(dirname):
	del_file_command="rm " + dirname + "/*.pdf"
	rm_dir_command="rmdir " + dirname
	os.system(del_file_command)
	os.system(rm_dir_command)

def wget_download(src_url,dest_dir,dest_filename):
	file_path=dest_dir + "/" + dest_filename
	wget_command="wget -c -O " + file_path + " " + src_url + " 2>&1"
	os.system(wget_command)



def download_file(src_url,dest_dir,dest_filename): #fetches the index page via pycurl
	recv_data= StringIO.StringIO()
	file_path=dest_dir + "/" + dest_filename
	dl_handle=pycurl.Curl()
	dl_handle.setopt(dl_handle.URL,src_url)
	dl_handle.setopt(dl_handle.WRITEFUNCTION, recv_data.write)
	dl_handle.perform()
	fl_handle = open(file_path, 'w')
	fl_handle.write(recv_data.getvalue())
	fl_handle.close()
	dl_handle.close()


def extract_file(src_file,dest_dir):
	# check file is tarfile or not
	# extract tarfile if true
	# if false return error
	if tarfile.is_tarfile(src_file):
		# open the tar file
		tfile = tarfile.open(src_file)
		# extract all contents
		#make a temporary folder called pdfs
		ext_dir=dest_dir + "/pdfs"
		if not os.path.exists(ext_dir):
			os.makedirs(ext_dir)
		tfile.extractall(ext_dir)
	else:
		print src_file + " is not a valid tar archive."


def merge_pdf(src_dir,dest_dir,dest_filename):
	#downloaded files are extracted to a sub dir called pdfs
	pdf_dir=src_dir + "/pdfs"
	#uses ghostscript to merge pdf
	file_path=dest_dir + "/" + dest_filename
	gs_command="gs -q -dNOPAUSE -sDEVICE=pdfwrite -sOUTPUTFILE=" + file_path + " -dBATCH " + pdf_dir + "/*.pdf 2>/dev/null"
	os.system(gs_command)
	#delete extracted files and delete directory
	nukedir(pdf_dir)


url="http://www.karve.in/news_archive/index.php"
#define download directory
dl_dir= os.getenv("HOME") + "/news_archive"
#check if the parent download directory exists; if not, create it
if not os.path.exists(dl_dir):
	os.makedirs(dl_dir)
#get current date parts like day month year
now = datetime.datetime.now()
day= now.day
month=now.month
year=now.year
#download the page containing newspaper links (url defined above)
ty_dl_dir="/tmp"
ty_file_name="karve"
download_file(url,ty_dl_dir,ty_file_name)
#open the downloaded file and parse it to find links
#using Beautiful Soup library
url_file = ty_dl_dir + "/" + ty_file_name
f = open(url_file, 'r')
read_data = f.read()
soup = BeautifulSoup(read_data)
for tag in soup.findAll('a', href=True):
	link= tag['href']
	#check whether the link terminates with a .gz extension
	gz = os.path.splitext(link)[1]
	if (gz == ".gz"): #then we have a valid html link
		#extract primary filename from the download link
		archive_name=os.path.basename(link)
		#remove dot tar dot gz = 7 characters
		str_len=len(archive_name)
		content_name = archive_name[4:str_len-18] #4 offset is given to strip "The_"
		#extract the date part from the archive name above
		content_date = archive_name[str_len-17:str_len-7] #the 10-character date that precedes ".tar.gz"
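		#Worked example with a hypothetical archive name (the real names on the
		#server may differ): "The_Times_2011-01-15.tar.gz"
		#  archive_name[4:str_len-18]         -> "Times"       (drops "The_" and "_YYYY-MM-DD.tar.gz")
		#  archive_name[str_len-17:str_len-7] -> "2011-01-15"  (the date part before ".tar.gz")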
		#make a parent directory to hold content
		content_dir=dl_dir + "/" + content_name
		if not os.path.exists(content_dir):
			os.makedirs(content_dir)
		#download content
		print "Staring to download content for " + content_name
		#download file using wget
		#wget_download(link,content_dir,archive_name)
		#extract the archive
		extract_file(content_dir + "/" + archive_name,content_dir)
		#merge pdf
		merge_pdf(content_dir,content_dir,content_name)
		
f.close()
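
To run the Python version, save it under the name given in its header (ishan.py) and launch it with the system's Python 2 interpreter:

python ishan.py

Downloads end up under $HOME/news_archive, in one sub-directory per newspaper.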
Ishan Karve