加速下载NCBI数据库

直接放烂码 利用了两个工具 ascp 和 axel

#!/bin/bash
#
#	download_SURPI_data.sh
#
#	This program will download the data files necessary to construct SURPI reference data.
#	It verifies the md5sum if available.
#
#	Chiu Laboratory
#	University of California, San Francisco
#	January, 2014
#
# Copyright (C) 2014 Scot Federman - All Rights Reserved
# SURPI has been released under a modified BSD license.
# Please see license file for details.
# Last revised 7/2/2014  

scriptname=${0##*/}
bold=$(tput bold)
normal=$(tput sgr0)
green='\e[0;32m'
red='\e[0;31m'
endColor='\e[0m'
DATE=$(date +%m%d%Y)
destination_dir="NCBI_$DATE"
curated_dir="curated_$DATE"

while getopts ":hd:c:" option; do
	case "${option}" in
		h) HELP=1;;
		d) destination_dir=${OPTARG};;
		c) curated_dir=${OPTARG};;
		:)	echo "Option -$OPTARG requires an argument." >&2
			exit 1
      		;;
	esac
done

if [[ ${HELP-} -eq 1 ]]
then
	cat <<USAGE
	
${bold}$scriptname${normal}

This program will download necessary files from NCBI for use with SURPI. 

${bold}Command Line Switches:${normal}

	-h	Show this help

	-d	Specify directory to create for downloaded data
		(optional. If unsupplied, will default to NCBI_[current date] )

	-c	Specify directory to create for curated data
		(optional. If unsupplied, will default to curated_[current date] )

${bold}Usage:${normal}

$scriptname -d NCBI_07022014

USAGE
	exit
fi

NCBI="ftp-private.ncbi.nlm.nih.gov"
chiulab_dir="http://chiulab.ucsf.edu/SURPI/databases"
FASTA_dir="/blast/db/FASTA/"
TAXONOMY_dir="/pub/taxonomy/"

nt="nt.gz"
nt_md5="nt.gz.md5"

nr="nr.gz"
nr_md5="nr.gz.md5"

taxdump="taxdump.tar.gz"
taxdump_md5="taxdump.tar.gz.md5"

#These files do not have md5 (as of 6/2014)
gi_taxid_nucl="gi_taxid_nucl.dmp.gz"
gi_taxid_prot="gi_taxid_prot.dmp.gz"

download_file ()
{
	destination_folder=$1
	remote_dir=$2
	file=$3
    type=$4
	md5=$5
    if [ $type != "ncbi" ]; then
        ( cd $destination_folder ; axel -n 10 "$remote_dir/$file" )
    else
        ( cd $destination_folder ; ascp -l 100m -T -i ~/.aspera/connect/etc/asperaweb_id_dsa.openssh --user anonftp --mode recv --host $remote_dir $file . )
	fi
    
	if [[ $md5 ]]
	then
		( cd $destination_folder ; curl -O "$remote_dir/$md5" )
		( cd $destination_folder ; md5sum -c --status "$md5" )
		if [ $? -ne 0 ]; then
			echo -e "${red}md5check of $file: failed.${endColor}"
			exit
		else
			echo -e "${green}md5sum of $file: OK${endColor}"
		fi
	fi
}

#
## Download NCBI data
#

if [ ! -d "$destination_dir" ]; then
	mkdir "$destination_dir"
fi

if [ ! -f "$destination_dir/$nt" ]; then
	echo -e "$(date)\t$scriptname\tDownloading $nt"
	download_file "$destination_dir" "$NCBI" "$FASTA_dir$nt" "ncbi" "$nt_md5"
else
	echo -e "$(date)\t$scriptname\t$nt already present."
fi

if [ ! -f "$destination_dir/$nr" ]; then
	echo -e "$(date)\t$scriptname\tDownloading $nr"
	download_file "$destination_dir" "$NCBI" "$FASTA_dir$nr" "ncbi" "$nr_md5"
else
	echo -e "$(date)\t$scriptname\t$nr already present."
fi

if [ ! -f "$destination_dir/$taxdump" ]; then
	echo -e "$(date)\t$scriptname\tDownloading $taxdump"
	download_file "$destination_dir" "$NCBI" "$TAXONOMY_dir$taxdump" "ncbi" "$taxdump_md5"
else
	echo -e "$(date)\t$scriptname\t$taxdump already present."
fi

if [ ! -f "$destination_dir/$gi_taxid_nucl" ]; then
	echo -e "$(date)\t$scriptname\tDownloading $gi_taxid_nucl"
	download_file "$destination_dir" "$NCBI" "$TAXONOMY_dir$gi_taxid_nucl" "ncbi"
else
	echo -e "$(date)\t$scriptname\t$gi_taxid_nucl already present."
fi

if [ ! -f "$destination_dir/$gi_taxid_prot" ]; then
	echo -e "$(date)\t$scriptname\tDownloading $gi_taxid_prot"
	download_file "$destination_dir" "$NCBI" "$TAXONOMY_dir$gi_taxid_prot" "ncbi"
else
	echo -e "$(date)\t$scriptname\t$gi_taxid_prot already present."
fi

#
## Download Chiulab curated data
#
declare -a download_list=(	"Bacterial_Refseq_05172012.CLEAN.LenFiltered.uniq.fa.gz" \
							"hg19_rRNA_mito_Hsapiens_rna.fa.gz" \
							"rapsearch_viral_aa_130628_db_v2.12.fasta.gz" \
							"viruses-5-2012_trimmedgi-MOD_addedgi.fa.gz" \
							"18s_rRNA_gene_not_partial.fa.gz" "23s.fa.gz" \
							"28s_rRNA_gene_NOT_partial_18s_spacer_5.8s.fa.gz" \
							"rdp_typed_iso_goodq_9210seqs.fa.gz")

if [ ! -d "$curated_dir" ]; then
	mkdir "$curated_dir"
fi

for download in "${download_list[@]}"
do
	if [ ! -f "$curated_dir/$download" ]; then
		echo -e "$(date)\t$scriptname\tDownloading $download"
		download_file "$curated_dir" "$chiulab_dir" "$download" "chiulab" "$download.md5"
	else
		echo -e "$(date)\t$scriptname\t$download already present."
	fi
done

echo -e "$(date)\t$scriptname\t${green}Download complete.${endColor}"

加载评论