############################################################################
# Copyright (c) 2011-2016 BIOLS, CAS
# All Rights Reserved
# See file LICENSE for details.
############################################################################

=begin SUMMARY
Main steps:
1. Check and record parameters designated by user
2. Prepare files to process
	key subroutines:
	&split_sam_file &split_transcript
3. First scanning of SAM
	key subroutines:
	&parallel_scan1 &mapping_check_parallel &gtag_pem_repeat
4. Annotation of candidate circRNAs
5. Second scanning of SAM
	key subroutines:
	&parallel_scan2 &circ_read_candidate &read_circ_range
6. Output
=cut

use strict;
use threads;
use Getopt::Long;
use File::Basename;

$| = 1;
my $version = '2.0.5';

### Parameters can be designated by user
my ( $sam, $cRNA_out, $anno_gtf, $ref_dir, $ref_1file, $log, $help, $max_circle, $min_circle, $high, $low, $no_strigency, $min_mapq_uni, $rel_exp, $max_thread, $chrM, $quiet, $output_all );
Getopt::Long::GetOptions (
	'in|I=s'					=>	\$sam,
	'out|O=s'					=>	\$cRNA_out,
	'ref_file|F=s'				=>	\$ref_1file,
	'ref_dir|R=s'				=>	\$ref_dir,
	'anno|A=s'					=>	\$anno_gtf,
	'log|G=s'					=>	\$log,
	'help|H!'					=>	\$help,
	'max_span|S=i'				=>	\$max_circle,
	'high_strigency|high!'		=>	\$high,
	'low_strigency|low!'		=>	\$low,
	'no_strigency|0!'			=>	\$no_strigency,
	'mapq_uni|U=i'				=> \$min_mapq_uni,
	'rel_exp|E=f'				=>	\$rel_exp,
	'chrM|M=s'					=>	\$chrM,
	'thread_num|T=i'			=>	\$max_thread,
	'quiet|Q!'					=>	\$quiet,
	'output_all|D!'			=>	\$output_all,
);
if ( !defined($sam) and !defined($cRNA_out) and !defined($anno_gtf) and !defined($ref_dir) and !defined($log) and !defined($help) and !defined($max_circle) ) {
	print "Please use the --help or -H option to get usage information.\n";
### Show help info to user if requested
} elsif (defined($help)) {
	### Usage text. Long option names are spelled exactly as declared in GetOptions above
	### (underscores, e.g. --ref_dir), because Getopt::Long does not accept the hyphenated forms.
	print '
Program:  CIRI2 (circRNA identifier2)
Version:  2.0.5
Contact:  Yuan Gao <gaoyuan06@mails.ucas.ac.cn>

Usage:    perl CIRI.pl -I in.sam -O output.ciri -F ref.fa (-R ref_dir/)

Arguments:
    -I, --in
          input SAM file name (required; generated by BWA-MEM)
    -O, --out
          output circRNA list name (required)
    -F, --ref_file
          FASTA file of all reference sequences. Please make sure this file is
          the same one provided to BWA-MEM. Either this argument or
          -R/--ref_dir is required.
    -R, --ref_dir
          directory of reference sequence(s). Please make sure fasta files in
          this directory are from the FASTA file(s) provided to BWA-MEM. Either
          this argument or -F/--ref_file is required.
    -A, --anno
          input GTF/GFF3 formatted annotation file name (optional)
    -G, --log
          output log file name (optional)
    -H, --help
          show this help information
    -S, --max_span
          max spanning distance of circRNAs (default: 200000)
    -high, --high_strigency
          use high stringency: only output circRNAs supported by more than 2
          distinct PCC signals (default)
    -low, --low_strigency
          use low stringency: only output circRNAs supported by more than 2
          junction reads
    -0, --no_strigency
          output all circRNAs regardless of junction read or PCC signal counts
    -U, --mapq_uni
          set threshold for mapping quality of each segment of junction reads
          (default: 10; should be within [0,30])
    -E, --rel_exp
          set threshold for relative expression calculated based on counts of
          junction reads and non-junction reads (optional: e.g. 0.1)
    -M, --chrM
          tell CIRI2 the ID of mitochondrion in reference file(s) (default:
          chrM)
    -T, --thread_num
          set number of threads for parallel running (default: 1)
    -Q, --quiet
          keep quiet when running
    -D, --output_all
          keep the temporary files after running (more disk space would be
          needed)
';
} else {
	my $strigency;
	my $read_length;
	my (%chr_seq, $output_dir, $input_dir);
	my (@die_reason, @warn_reason);
	### A preliminary check on whether an appropriate sam file is provided
	if (!defined($sam)) {
		push @die_reason, "Please use --in or -I option to designate input SAM alignment file!\n";
	} elsif (!-e $sam or !-f $sam) {
		push @die_reason, "No SAM alignment file found at designated directory!\n";
	} else {
		if (rindex($sam, "/") >= 0) {
			$input_dir = substr( $sam, 0, rindex($sam, "/")+1 );
		} else {
			$input_dir = "./";
		}
	}
	### A preliminary check on whether an appropriate output directory is designated
	if (!defined($cRNA_out)) {
		push @die_reason, "Please use --out or -O option to designate output file!\n";
	} else {
		if (-e $cRNA_out and -f $cRNA_out) {
			push @warn_reason, " Output file $cRNA_out already exists and is overwritten.\n";
		}
		if (rindex($cRNA_out, "/") >= 0) {
			$output_dir = substr($cRNA_out, 0, rindex($cRNA_out, "/")+1);
		} else {
			$output_dir = "./";
		}
		if (!-w $output_dir) {
			push @die_reason, "Output file cannot be written in the directory $output_dir!\n";
		}
	}
	
	### Send error messages to CIRI2error.log in the output directory.
	### Skipped when no output directory could be derived (no --out given): the path would
	### otherwise collapse to /CIRI2error.log. The missing option is reported via @die_reason.
	if (defined $output_dir) {
		open STDERR, ">>", $output_dir."/CIRI2error.log" or warn "CIRI cannot write to error log: $!";
	}

	### The log file defaults to <output file>.log
	if ( !defined($log) and defined($cRNA_out) ) {
		$log = $cRNA_out.'.log';
	}
	### A check on minimum mapping quality designated by user
	if (!defined $min_mapq_uni) {
		$min_mapq_uni = 10;
	} elsif ($min_mapq_uni > 30) {
		push @die_reason, "Threshold for mappqing quality of each segment of junction reads cannot be larger than 30!\n";
	} elsif ($min_mapq_uni < 0) {
		push @die_reason, "Threshold for mappqing quality of each segment of junction reads cannot be smaller than 0!\n";
	}

	if (!defined $chrM) {
		$chrM = 'chrM';
	}
	### Preview the SAM file: record reference lengths from its header, sample the first reads
	### to tell how many kinds of reads (first-in-pair flag set / not set) are present, and
	### check the thread number requested by user.
	my (%chr_length, %split_files_key_read);
	my $if_PE;
	{
		my (%test_read, %reads);
		### The preview only runs on a SAM file accepted by the input check above. Any problem
		### is queued in @die_reason so that it is printed together with the other fatal errors
		### instead of aborting here (STDERR may already be redirected to the error log).
		my $sam_ok = ( defined($sam) and -e $sam and -f $sam ) ? 1 : 0;
		if ( $sam_ok and !open(SAM, "<", $sam) ) {
			push @die_reason, "Cannot open SAM alignment file $sam: $!\n";
			$sam_ok = 0;
		}
		if ($sam_ok) {
			### SAM is left open here; it is opened again before the first scanning.
			while (<SAM>) {
				chomp;
				if (/^[@]/) {
					if (/^[@]SQ/) {
						my @line = split /\t/;
						my ($chr, $length);
						### SN and LN are searched in every field rather than at fixed columns
						for my $field (@line[1 .. $#line]) {
							if ( !defined($chr) and $field =~ /^SN:(.+$)/ ) {
								$chr = $1;
							} elsif ( !defined($length) and $field =~ /^LN:(\d+)/ ) {
								$length = $1;
							}
						}
						if (defined $chr and defined $length) {
							$chr_length{$chr} = $length;
						}
					}
				} else {
					my @line = split /\t/;
					if ( !exists($test_read{$line[0]}) ) {
						$test_read{$line[0]} = 1;
					}
					### group read names by bit 7 of FLAG as returned by &ten2b
					push @{$reads{&ten2b($line[1],7)}}, $line[0];
					### 200 distinct read names are enough for the preview
					last if scalar(keys %test_read) >= 200;
				}
			}
		}
		$if_PE = scalar(keys %reads);
		if (!defined $max_thread or $max_thread == 1) {
			$max_thread = 1;
			&split_sam_file($sam, $max_thread) if $sam_ok;
		} elsif ($max_thread < 1) {
			### without this check no file would be scanned at all and an empty result written
			push @die_reason, "Thread number cannot be smaller than 1.\n";
		} elsif ($max_thread > 32) {
			push @die_reason, "Please do not request thread number more than 32, which would not increase more speed of CIRI.\n";
		}
	}

	my ($gff, $gtf);

	### Stringency designated by user
	if ( defined($high) and defined($low) ) {
		push @die_reason, "Sensitivity cannot be both high and low.\n";
	} elsif (defined($low)) {
		$strigency = 1;
	} elsif (defined($no_strigency)) {
		$strigency = 0;
	} else {
		$strigency = 2;
	}

	### A preliminary check on whether an appropriate annotation file is provided
	my $if_anno;
	if ( defined($anno_gtf) and (!-e $anno_gtf or !-f $anno_gtf) ) {
		push @die_reason, "No annotation file found at designated directory!\n";
	} elsif ( defined($anno_gtf) and !($anno_gtf =~ /\.gff/ or $anno_gtf =~ /\.gtf/) ) {
		push @die_reason, "Please provide .gff or .gtf format as annotation file!\n";
	} elsif ( defined($anno_gtf) and $anno_gtf =~ /\.gff/ ) {
		$gff = 1;
		$if_anno = 1;
	} elsif ( defined($anno_gtf) and $anno_gtf =~ /\.gtf/ ) {
		$gtf = 1;
		$if_anno = 1;
	}

	### If the annotation is formatted gff, further determine the type of gff of the annotation file provided
	if ( defined $gff and $gff == 1 ) {
		my $test_line = 1000;
		my $line_count = 0;
		open ANNO, "<", $anno_gtf or die "cannot open the annotation file: $!";
		while (<ANNO>) {
			chomp;
			$line_count ++;
			my @line = split /\t/;
			if ($line[2] eq 'exon') {
				if ($line[8] =~ /^Parent=\w+\.\w*.*/) {
					$gff ++;
					last;
				} elsif ($line[8] =~ /gene=\w+;.*transcript_id=\w+\.*/) {
					$gff += 2;
					last;
				}
			}
			if ( $line_count >= $test_line and $gff == 1 ) {
				last;
			}
		}
		if ($gff == 1) {
			push @die_reason, "The GFF file provided cannot be understood by CIRI! Please refer to manual for details of required GFF formats\n";
		}
	}
	### Maximum of genomic range designated by user for detected circRNAs
	if ( defined($max_circle) and $max_circle < 10_000 ) {
		push @die_reason, "Max span size on reference cannot be smaller than 10000!\n";
	} elsif (!defined($max_circle)) {
		$max_circle = 200_000;
	}
	$min_circle = 140;
	if ( defined($rel_exp) and $rel_exp > 1 ) {
		push @die_reason, "Relative expression cannot be larger than 1!\n";
	} if ( defined($rel_exp) and $rel_exp < 0 ) {
		push @die_reason, "Relative expression cannot be smaller than 0!\n";
	}
	### Keep temporary files in outputs or not
	unless (defined $output_all){
		$output_all = 'no';
	}
	### Keep quiet when running or not
	unless (defined $quiet){
		$quiet = 'no';
	}
	### Report and quit if any fatal error found.
	if (@die_reason >= 1) {
		print @die_reason;
		print "Fatal error. Aborted.\n";
		die;
	} else {
		### Check whether the reference provided by user is appropriate and load it into %chr_seq.
		### --ref_dir takes precedence over --ref_file when both are given.
		### The log file handle MOD is opened here and stays open for the rest of the run.
		if ( !defined($ref_dir) and !defined($ref_1file) ) {
			push @die_reason, "Please use --ref_dir or -R option to designate reference directory for multiple reference files, or use --ref_file or -F to designate one file with all references in!\n";
		} elsif ( defined($ref_dir) and (!-e $ref_dir or !-d $ref_dir) ) {
			push @die_reason, "Reference directory $ref_dir does not exist!\n";
		} elsif ( defined($ref_dir) and !-r $ref_dir ) {
			push @die_reason, "Reference directory $ref_dir is not readable!\n";
		} elsif (defined $ref_dir) {
			open MOD, ">>", $log or die "CIRI cannot write to log file $log: $!";
			for my $step ("CIRI begins running", "Loading reference") {
				print MOD 	'[', scalar(localtime), "] $step\n";
				print 		'[', scalar(localtime), "] $step\n" if $quiet eq 'no';
			}
			### drop one trailing slash so that paths can be built as $ref_dir/<file>
			$ref_dir =~ s{/\z}{};
			my @ref_file = <$ref_dir/*.fa $ref_dir/*.fasta>;
			if (@ref_file == 0) {
				push @die_reason, "No fasta file is found in designated refenece directory.\n" if 0;
				push @die_reason, "No fasta file is found in designated reference directory.\n";
			}
			my @chr_not_found;
			### one file per reference named in the SAM header; the mitochondrion is not loaded
			for my $chr (keys %chr_length) {
				next if $chr eq $chrM;
				my $uni_seq = &read_uni_chr($ref_dir, $chr);
				if (${$uni_seq}[0] == 1){
					$chr_seq{$chr} = ${$uni_seq}[1];
				}elsif (${$uni_seq}[0] == 2){
					push @die_reason, "There are more than one sequence in $chr file. Please check!\n";
				}else{
					push @chr_not_found, $chr;
				}
			}
			if ( @chr_not_found > 0 ) {
				push @die_reason, "The following chromosomes are not found in $ref_dir or not formatted in FASTA: @chr_not_found\n";
			}
		} elsif ( defined($ref_1file) and (!-e $ref_1file or !-f $ref_1file) ) {
			push @die_reason, "Reference file $ref_1file does not exist!\n";
		} elsif ( defined($ref_1file) and !-r $ref_1file ) {
			push @die_reason, "Reference file $ref_1file is not readable!\n";
		} elsif ( defined($ref_1file) ) {
			### a log file that cannot be written does not stop the run in this mode, but is reported
			open MOD, ">>", $log or warn "CIRI cannot write to log file $log: $!";
			for my $step ("CIRI begins running", "Loading reference") {
				print MOD 	'[', scalar(localtime), "] $step\n";
				print 		'[', scalar(localtime), "] $step\n" if $quiet eq 'no';
			}
			my $tag4ref = 0;
			my $ref_ID;
			my @chr_not_found;
			for my $chr (keys %chr_length) {
				$chr_seq{$chr} = '';
			}
			if ( open(CHR1, "<", $ref_1file) ) {
				while (<CHR1>) {
					chomp;
					### files with CRLF line ends would otherwise leave a CR in the sequence
					s/\r\z//;
					if ( /^>(\S+)/ and exists $chr_seq{$1} ) {
						### header of a reference named in the SAM header: start collecting
						$ref_ID = $1;
						$tag4ref = 1;
					} elsif (/^>/) {
						### header of a sequence not named in the SAM header: skip its lines
						$tag4ref = 0;
					} elsif ($tag4ref == 1) {
						$chr_seq{$ref_ID} .= uc($_);
					}
				}
				close CHR1;
				for my $chr (keys %chr_length) {
					if ( length($chr_seq{$chr}) == 0 and $chr ne $chrM ) {
						push @chr_not_found, $chr;
					}
				}
				if (@chr_not_found > 0) {
					push @die_reason, "The following chromosomes are not found in $ref_1file: @chr_not_found\n";
				}
			} else {
				### nothing was read, so only the open failure is reported
				push @die_reason, "cannot open reference file $ref_1file: $!\n";
			}
		}
		### Quit if fatal error found.
		if (@die_reason >= 1) {
			print @die_reason;
			print "Fatal error. Aborted.\n";
			die;
		} elsif (@warn_reason >= 1) {
			print @warn_reason;
		}
	}

	### Check if multiple threads are designated by user.
	if ($max_thread >= 2) {
		print MOD 	'[', scalar(localtime), "] Requesting system to split SAM into $max_thread pieces\n";
		print 		'[', scalar(localtime), "] Requesting system to split SAM into $max_thread pieces\n" if $quiet eq 'no';
		### If so, divide the SAM file accordingly and record the names of them.
		my $time_split = &split_sam_file($sam, $max_thread);
	}

	my ($pre_read, $pre_read2, $strand_2nd, $z2, @candidate_reads, @loci_validated, $total_validated, @qualified_cluster);
	my (@PE_reads2);
	my $cluster_num2 = 0;
	my ($asterisk, $all_length_match) = (0, 0);
	my (%chr, %site1, %site2, %sense_strand, %cluster_read, %chr_validated, %chr_end1, %chr_end2, %chr_seq1, %chr_seq2, %seq4read1, %seq4read2, %chr_reads, @chr_cluster, %chr_division1, %chr_division2, @site1_cluster, @site2_cluster, %linear_cp, %string, %chr_seq_length, %MSID_read, %FP_cluster, %cigar_read, %if_1st_read, %seq1_cluster, %seq2_cluster, %chr_division1_reads, %chr_division2_reads, %chr_range1, %chr_range2, %chr_range1_reads, %chr_range2_reads);
	my (%chr_site1_gene_trsc, %chr_site2_gene_trsc);
	my (%chr_site1, %chr_site2, %chr_gene_site1, %chr_gene_site2, %chr_site1_gene, %chr_site2_gene);
	my $cluster_num = 0;
	my %gene_exon_exist;
	my (%gene_exon, %gene_loci, %chr_gene, $if_anno_add, %read_anno, @cluster_anno, %anno_gene, @anno_cluster);
	my $add_validated;
	my ($inter_num, $intron_num, $exon_num);
	my %type;
	my %candidate4mode_check;
	my (%read_in_chr);
	my %tmp_chr;
	my ($PEM_reads, $non_PEM_reads, $wrong_PEM_reads) = (0, 0, 0);
	my (%bsj_reads, %circ, %non_PEM, %FP);
	my ($PEM_more_reads, $non_PEM_more_reads, $wrong_PEM_more_reads) = (0, 0, 0);
	my (%strand_circ, %bibases_circ, @cand_circ_sort, %final_cluster, %cigars_cluster, %exp_cRNA);
	my (%exon_start, %exon_end);
	{
		### If the annotation is provided, scan it to record exon start and end, as well as gene start and end.
		my @gene_anno;
		if ( $if_anno == 1 and $gtf == 1 ) {
			open ANNO, "<", $anno_gtf or die "cannot open the annotation file $anno_gtf: $!";
			my (@gene_anno, $pre_gene);
			while (<ANNO>) {
				chomp;
				my @line = split /\t/;
				if ( defined $line[2] and $line[2] eq 'exon' ) {
					if ( $line[8] =~ /gene_id \"(\S+)\"/ ) {
						my $gene_ID = $1;
						$exon_start{$line[0]}{$line[3]}{$gene_ID} ++;
						$exon_end{$line[0]}{$line[4]}{$gene_ID} ++;
						if ( defined $pre_gene and $pre_gene ne $gene_ID ) {
							&split_transcript($pre_gene, @gene_anno);
							@gene_anno = ();
						}
						push @gene_anno, $_;
						$pre_gene = $gene_ID;
					} else {
						die "CIRI cannot understand $anno_gtf.";
					}
				}
			}
			&split_transcript($pre_gene, @gene_anno);
		} elsif ( $if_anno == 1 and $gff >= 2 ) {
			my $pre_gene;
			if ($gff == 2) {
				open ANNO, "<", $anno_gtf or die "cannot open the annotation file: $!";
				while (<ANNO>) {
					chomp;
					my @line = split /\t/;
					if ( defined $line[2] and $line[2] eq 'exon' ) {
						if ($line[8] =~ /^Parent=((\w+)\.\w*).*/) {
							my $gene_ID = $2;
							$exon_start{$line[0]}{$line[3]}{$gene_ID} ++;
							$exon_end{$line[0]}{$line[4]}{$gene_ID} ++;
							if ( defined $pre_gene and $pre_gene ne $gene_ID ) {
								&split_transcript($pre_gene, @gene_anno);
								@gene_anno = ();
							}
							push @gene_anno, $_;
							$pre_gene = $gene_ID;
						} else {
							die "CIRI cannot understand $anno_gtf.";
						}
					}
				}
				&split_transcript($pre_gene, @gene_anno);
			} elsif ($gff == 3) {
				open ANNO, "<", $anno_gtf or die "cannot open the annotation file: $!";
				while (<ANNO>) {
					chomp;
					my @line = split /\t/;
					if ( defined $line[2] and $line[2] eq 'exon' ) {
						if ($line[8] =~ /;gene=(\w+)/) {
							my $gene_ID = $1;
							$exon_start{$line[0]}{$line[3]}{$gene_ID} ++;
							$exon_end{$line[0]}{$line[4]}{$gene_ID} ++;
							if ( defined $pre_gene and $pre_gene ne $gene_ID ) {
								&split_transcript($pre_gene, @gene_anno);
								@gene_anno = ();
							}
							push @gene_anno, $_;
							$pre_gene = $gene_ID;
						} else {
							die "CIRI cannot understand $anno_gtf.";
						}
					}
				}
				&split_transcript($pre_gene, @gene_anno);
			}
		}
	}
	%gene_exon_exist = ();
	open CRNAOUT, ">", $cRNA_out or die "CIRI cannot write $cRNA_out: $!";
	open SAM, "<", $sam or die "cannot open the sam file: $!";
	print MOD 	'[', scalar(localtime), "] First scanning\n";
	print 		'[', scalar(localtime), "] First scanning\n" if $quiet eq 'no';

	### first scanning of sam
	###	pass mapping records of a pair of reads/single read to &mapping_check_parallel and check if they are candidates of bsj reads
	### record results in tmp1 files

	if ($max_thread >= 1) {
		my @ths;
		while (my ($file, $key_read) = each %split_files_key_read) {
			my $th = threads -> new({'context' => 'void'}, \&parallel_scan1, [$file, $key_read]);
			my $th_id = $th->tid();
			#print " Worker $th_id begins to scan $file.\n" if $quiet eq 'no';
			push @ths, $th;
		}
		for (@ths) {
			my $th_id = $_->tid();
			$_ -> join();
			#print " Worker $th_id finished reporting.\n" if $quiet eq 'no';
		}
		@ths = ();
	}

	### scanning of tmp1 files
	###	give summary of candidate BSJ reads and other reads

	{
		while ( my ($file, $key_read) = each %split_files_key_read ) {
			open IN, "<", $output_dir.$file.'.list' or die "cannot open tmp $output_dir$file.list: $!";
			while (<IN>) {
				chomp;
				my ( $read_name, $tag, $chr, $cigar1, $cigar2, $cigar3, $start, $end, $strand, $bibases1, $bibases2 ) = split (/\t/, $_);
				my $circ = $chr.':'.$start.'|'.$end;
				if ($tag == 1) {
					$PEM_reads ++;
				} elsif ($tag == -1) {
					$non_PEM_reads ++;
				} elsif ($tag == -2) {
					$wrong_PEM_reads ++;
				}
				unless (exists $strand_circ{$circ}) {
					$strand_circ{$circ} = $strand;
					$bibases_circ{$circ} = [$bibases1, $bibases2];
				}
				$cigar_read{$read_name} = [$cigar1, $cigar2, $cigar3];
			}
			close IN;
		}
		print MOD 	" Candidate reads with splicing signals: ".($PEM_reads+$non_PEM_reads+$wrong_PEM_reads)."\n";
		print 		" Candidate reads with splicing signals: ".($PEM_reads+$non_PEM_reads+$wrong_PEM_reads)."\n" if $quiet eq 'no';
		print MOD 	" Candidate reads with PEM signals: ", $PEM_reads, "\n";
		print 		" Candidate reads with PEM signals: ", $PEM_reads, "\n" if $quiet eq 'no';
		print MOD 	" Candidate circRNAs found: ", scalar(keys %strand_circ), "\n";
		print 		" Candidate circRNAs found: ", scalar(keys %strand_circ), "\n" if $quiet eq 'no';
		my (%circ_start, %circ_end, %circ_chr, $pre_chr, @sort_chr_gene, $start_gene_index);
		while ( my ($cand_circ, undef) = each %strand_circ ) {
			my ($chr, $start, $end) = split /[:|]/, $cand_circ;
			$circ_chr{$cand_circ} = $chr;
			$circ_start{$cand_circ} = $start;
			$circ_end{$cand_circ} = $end;
		}

		### Sort and annotate candidate circRNAs according to their positions
		###	three categories of candidate circRNAs: exon, intron, intergenic
		###	the first two categories are also recorded for the corresponding gene ID

		@cand_circ_sort = sort {$circ_chr{$a} cmp $circ_chr{$b} or $circ_start{$a} <=> $circ_start{$b} or $circ_end{$a} <=> $circ_end{$b}} (keys %strand_circ);
		for my $i (0 .. $#cand_circ_sort) {
			$site1_cluster[$i] = $circ_start{$cand_circ_sort[$i]};
			$site2_cluster[$i] = $circ_end{$cand_circ_sort[$i]};
			$chr_cluster[$i] = $circ_chr{$cand_circ_sort[$i]};
			my $division1 = int($site1_cluster[$i]/500);
			my $division2 = int($site2_cluster[$i]/500);
			for (-6 .. 6) {
				my $range1 = "$chr_cluster[$i]:".( $site1_cluster[$i]+$_ );
				my $range2 = "$chr_cluster[$i]:".( $site2_cluster[$i]+$_ );
				push @{$chr_range1{ $range1 }}, $i;
				push @{$chr_range2{ $range2 }}, $i;
			}
			push @{$chr_division1{ "$chr_cluster[$i]:$division1" }}, $i;
			push @{$chr_division2{ "$chr_cluster[$i]:$division2" }}, $i;
			if ($if_anno == 1) {
				if ( (!defined $pre_chr or $chr_cluster[$i] ne $pre_chr) and exists $gene_loci{$chr_cluster[$i]} ) {
					@sort_chr_gene = sort { $gene_loci{$chr_cluster[$i]}{$a}[0] <=> $gene_loci{$chr_cluster[$i]}{$b}[0] or $gene_loci{$chr_cluster[$i]}{$a}[1] <=> $gene_loci{$chr_cluster[$i]}{$b}[1] } (keys %{$gene_loci{$chr_cluster[$i]}});
					$start_gene_index = 0;
				}
				$pre_chr = $chr_cluster[$i];
				my $tag = 0;
				if ( exists $exon_start{$chr_cluster[$i]}{$site1_cluster[$i]} and exists $exon_end{$chr_cluster[$i]}{$site2_cluster[$i]} ) {
					while ( my ($start_gene, undef) = each %{$exon_start{$chr_cluster[$i]}{$site1_cluster[$i]}} ) {
						while ( my ($end_gene, undef) = each %{$exon_end{$chr_cluster[$i]}{$site2_cluster[$i]}} ) {
							if ($start_gene eq $end_gene) {
								push @{$anno_cluster[$i]}, $start_gene;
								$cluster_anno[$i] = 'exon';
								$tag = 1;
							}
						}
					}
				}
				if ( $tag == 0 and exists $gene_loci{$chr_cluster[$i]} ) {
					my @tmp_genes;
					for my $j ( $start_gene_index .. $#sort_chr_gene ) {
						if ( $site1_cluster[$i] >= $gene_loci{$chr_cluster[$i]}{$sort_chr_gene[$j]}[0] and $site2_cluster[$i] <= $gene_loci{$chr_cluster[$i]}{$sort_chr_gene[$j]}[1] ) {
							$tag = -1;
							push @tmp_genes, $sort_chr_gene[$j];
						}
					}
					if ($tag == -1) {
						for my $gene (@tmp_genes) {
							my @sort_gene_exon = sort { ${$a}[0] <=> ${$b}[0] or ${$a}[1] <=> ${$b}[1] } (@{$gene_exon{$gene}});
							my ($if_start_ok, $if_end_ok);
							for my $j (0 .. $#sort_gene_exon) {
								if ( $sort_gene_exon[$j][0] <= $site1_cluster[$i] and $sort_gene_exon[$j][1] >= $site1_cluster[$i] ) {
									$if_start_ok = 1;
								} if ( $sort_gene_exon[$j][0] <= $site2_cluster[$i] and $sort_gene_exon[$j][1] >= $site2_cluster[$i] ) {
									$if_end_ok = 1;
								}
							}
							if ( defined $if_start_ok and defined $if_end_ok ) {
								push @{$anno_cluster[$i]}, $gene;
								$cluster_anno[$i] = 'exon';
								$tag = 1;
							}
						}
						unless ($tag == 1) {
							$cluster_anno[$i] = 'intron';
							@{$anno_cluster[$i]} = @tmp_genes;
						}
					}
					unless ( defined $anno_cluster[$i] and @{$anno_cluster[$i]} > 0 ) {
						$cluster_anno[$i] = 'intergenic_region';
					}
				} else {
					$cluster_anno[$i] = 'intergenic_region' unless $tag == 1;
				}
			}
		}
	}
	print MOD 	'[', scalar(localtime), "] Second scanning\n";
	print 		'[', scalar(localtime), "] Second scanning\n" if $quiet eq 'no';

	### second scanning of SAM
	### process each divided SAM by each thread in &parallel_scan2 and record results in tmp2 files

	for my $th (threads->list()) {
		print "$th\n" if $quiet eq 'no';
	}
	if ($max_thread >= 1) {
		my @ths;
		while ( my ($file, $key_read) = each %split_files_key_read ) {
			my $th = threads -> new({'context' => 'void'}, \&parallel_scan2, [$file, $key_read]);
			my $th_id = $th->tid();
			#print " Worker $th_id begins to scan $file.\n" if $quiet eq 'no';
			push @ths, $th;
		}
		for my $th (@ths) {
			my $th_id = $th->tid();
			$th -> join();
			#print " Worker $th_id finished reporting.\n" if $quiet eq 'no';
		}
		@ths = ();
	}
	print MOD 	'[', scalar(localtime), "] Extracting info from temporary files\n";
	print 		'[', scalar(localtime), "] Extracting info from temporary files\n" if $quiet eq 'no';
	{
		%chr_seq = ();
		while ( my ($file, $key_read) = each %split_files_key_read ) {
			open IN, "<", $output_dir.$file.'.list' or die "cannot open tmp $output_dir$file.list: $!";
			while (<IN>) {
				chomp;
				my ($read_name, $tag, $chr, $cigar1, $cigar2, $cigar3, $start, $end, $strand, $bibases1, $bibases2) = split (/\t/, $_);
				my $circ = $chr.':'.$start.'|'.$end;
				if ($tag == -1) {
					$non_PEM{$read_name} = $circ;
				} elsif ($tag == -2) {
					$FP{$read_name} = $circ;
				}
				push @{$bsj_reads{$circ}}, $read_name;
			}
			close IN;
		}
		while ( my ($file, $key_read) = each %split_files_key_read ) {
			open IN, "<", $output_dir.$file.'.list2' or die "cannot open tmp $output_dir$file.list2: $!";
			while (<IN>) {
				chomp;
				my @line = split /\t/;
				if ($line[1] == 0) {
					for (2 .. $#line) {
						$linear_cp{$line[$_]} ++;
					}
				} else {
					my ($read_name, $bingo_x, $pinhead, $line5) = @line[0, 2, 3, 4];
					if ($line[1] == 1) {
						$PEM_more_reads ++;
					} elsif ($line[1] == -1) {
						$non_PEM_more_reads ++;
						$non_PEM{$read_name} = $bingo_x;
					} elsif ($line[1] == -2) {
						$wrong_PEM_more_reads ++;
						$FP{$read_name} = $bingo_x;
					}
					push @{$bsj_reads{$cand_circ_sort[$bingo_x]}}, $read_name;
					$cigar_read{$read_name}[$pinhead-1] = $line5;
				}
			}
			close IN;
		}
	}
		print MOD 	" Additional candidate reads found: ".($PEM_more_reads+$non_PEM_more_reads+$wrong_PEM_more_reads)."\n";
		print 		" Additional candidate reads found: ".($PEM_more_reads+$non_PEM_more_reads+$wrong_PEM_more_reads)."\n" if $quiet eq 'no';

	if ($if_PE == 2) {
		print MOD 	" Additional candidate reads with PEM signals: ", $PEM_more_reads, "\n";
		print 		" Additional candidate reads with PEM signals: ", $PEM_more_reads, "\n" if $quiet eq 'no';
	}
	print MOD 	'[', scalar(localtime), "] Summarizing\n";
	print 		'[', scalar(localtime), "] Summarizing\n" if $quiet eq 'no';

	### scanning of tmp2 files
	### summarize candidate circRNAs and output confident ones according to count of reads with PEM signals

		### For each sorted candidate, sort its BSJ reads into three groups (listed in %FP,
		### listed in %non_PEM, or neither) and count the distinct CIGAR strings seen in each of
		### the three CIGAR slots: once for the last group, once for the first two groups together.
		### A candidate passing the test of the chosen stringency is recorded in %final_cluster.
		for my $j (0 .. $#cand_circ_sort) {
			my $clusterID = $cand_circ_sort[$j];
			my (@FP_reads, @TP_reads, @non_reads);
			my (%cigar_cluster, %false_cigar_cluster);

			for my $read (@{$bsj_reads{$clusterID}}) {
				my ($group, $cigar_tally);
				if (exists $FP{$read}) {
					($group, $cigar_tally) = (\@FP_reads, \%false_cigar_cluster);
				} elsif (exists $non_PEM{$read}) {
					($group, $cigar_tally) = (\@non_reads, \%false_cigar_cluster);
				} else {
					($group, $cigar_tally) = (\@TP_reads, \%cigar_cluster);
				}
				push @{$group}, $read;
				for my $n (0 .. 2) {
					my $cigar = $cigar_read{$read}[$n];
					next if !defined $cigar or $cigar eq '';
					$cigar_tally->{$n}{$cigar} ++;
				}
			}

			### number of distinct CIGAR strings per slot
			my (@CIGAR_count, @false_CIGAR_count);
			for my $cigar_position (0 .. 2) {
				$CIGAR_count[$cigar_position] = exists $cigar_cluster{$cigar_position} ? scalar(keys %{$cigar_cluster{$cigar_position}}) : 0;
				$false_CIGAR_count[$cigar_position] = exists $false_cigar_cluster{$cigar_position} ? scalar(keys %{$false_cigar_cluster{$cigar_position}}) : 0;
			}
			my $CIGAR_sum = $CIGAR_count[0] + $CIGAR_count[1] + $CIGAR_count[2];
			my $false_CIGAR_sum = $false_CIGAR_count[0] + $false_CIGAR_count[1] + $false_CIGAR_count[2];
			my ($TP_num, $FP_num, $non_num) = (scalar(@TP_reads), scalar(@FP_reads), scalar(@non_reads));

			### the two stringency-dependent parts of the test
			my ($FP_tolerable, $support_enough);
			if ($strigency == 2) {
				$FP_tolerable = ( $TP_num > 19*$FP_num or $FP_num <= 1 );
				$support_enough = ( $CIGAR_sum >= 3 );
			} elsif ($strigency == 1) {
				$FP_tolerable = ( $TP_num > 19*$FP_num or $false_CIGAR_sum <= 2 );
				$support_enough = ( $TP_num >= 2 );
			} elsif ($strigency == 0) {
				$FP_tolerable = ( $TP_num > 19*$FP_num or $false_CIGAR_sum <= 2 );
				$support_enough = 1;
			} else {
				next;
			}
			### the part shared by all stringencies
			next unless $FP_tolerable and $TP_num > $non_num + $FP_num and $support_enough;

			$cluster_num2 ++;
			$final_cluster{$cluster_num2} = $j;
			$cigars_cluster{$j} = join("_", @CIGAR_count);
			$exp_cRNA{$j} = \@TP_reads;
		}
	### Write the circRNA list: one header line, then one line per circRNA recorded in %final_cluster.
	### With an annotation, circRNA_type and gene_id come from @cluster_anno and @anno_cluster;
	### without one both columns are "n/a" (the gene_id column used to be written as "/n/a").
	### circRNAs failing the --rel_exp threshold are skipped and counted in $removed4rel_exp.
	my ($removed4rel_exp) = (0);
	print CRNAOUT "circRNA_ID\tchr\tcircRNA_start\tcircRNA_end\t#junction_reads\tSM_MS_SMS\t#non_junction_reads\tjunction_reads_ratio\tcircRNA_type\tgene_id\tstrand\tjunction_reads_ID\n";
	for my $i (1 .. $cluster_num2) {
		my $j = $final_cluster{$i};
		my $junction_num = scalar(@{ $exp_cRNA{$j} });
		if ( defined($rel_exp) and $junction_num < $linear_cp{$j}*$rel_exp/2 ) {
			$removed4rel_exp ++;
			next;
		}
		my $circ_ID = "$chr_cluster[$j]:$site1_cluster[$j]|$site2_cluster[$j]";
		### without any non-junction read the ratio is written as 1
		my ($linear_num, $ratio) = (0, 1);
		if ( exists $linear_cp{$j} and $linear_cp{$j}>0 ) {
			$linear_num = $linear_cp{$j};
			$ratio = sprintf( "%.3f", $junction_num/( $linear_num/2 + $junction_num ) );
		}
		my ($circ_type, $gene_list) = ('n/a', 'n/a');
		if ($if_anno == 1) {
			$circ_type = $cluster_anno[$j];
			### every gene ID is followed by a comma, as are the read IDs below
			$gene_list = join('', map { "$_," } @{$anno_cluster[$j]}) if defined $anno_cluster[$j];
		}
		my $read_list = join('', map { "$_," } @{$exp_cRNA{$j}});
		print CRNAOUT join("\t", $circ_ID, $chr_cluster[$j], $site1_cluster[$j], $site2_cluster[$j], $junction_num, $cigars_cluster{$j}, $linear_num, $ratio, $circ_type, $gene_list, $strand_circ{$circ_ID}, $read_list), "\n";
	}
	print 		" Number of circular RNAs found: ", ($cluster_num2-$removed4rel_exp), "\n" if $quiet eq 'no';
	print MOD 	" Number of circular RNAs found: ", ($cluster_num2-$removed4rel_exp), "\n";
	### remove temporary files and say byebye
	### The files are deleted with unlink rather than through a shell "rm", so that file names
	### containing blanks or shell metacharacters are handled as plain names.
	if($output_all eq 'no'){
		for my $file (keys %split_files_key_read) {
			my @tmp_files = ("$output_dir$file.list", "$output_dir$file.list2");
			### the SAM pieces themselves are only removed when more than one thread was used
			unshift @tmp_files, "$output_dir$file" if $max_thread >=2;
			for my $tmp_file (@tmp_files) {
				unlink $tmp_file or warn "CIRI cannot remove temporary file $tmp_file: $!\n";
			}
		}
	}
	print MOD 	'[', scalar(localtime), "] CIRI finished its work. Please see output file $cRNA_out for detail.\n";
	print 		'[', scalar(localtime), "] CIRI finished its work. Please see output file $cRNA_out for detail.\n" if $quiet eq 'no';
	#print MOD 	"-" x 80, "\n";
	
	### The subroutine to record single reference from a FASTA file
	### Return test result and sequence of the reference
	### input: 1.$_[0]: directory of reference; 2.$_[1]: chromosome ID
	### output: reference to an array [tag, sequence], where tag is
	###	0: no FASTA header was read (neither <ID>.fa nor <ID>.fasta could be opened, or no header line)
	###	1: exactly one header was read; sequence is the upper-cased sequence
	###	2: a second header was met; reading stopped there

	sub read_uni_chr {
		my ($ref_dir, $chr) = @_;
		my $tag = 0;
		my $seq = '';
		my $fh;
		### <ID>.fa is tried first, then <ID>.fasta
		for my $suffix ('.fa', '.fasta') {
			last if open $fh, "<", $ref_dir."/".$chr.$suffix;
			undef $fh;
		}
		### neither file could be opened: report "not found" without reading anything
		return [$tag, $seq] unless defined $fh;
		while (my $line = <$fh>) {
			chomp $line;
			### files with CRLF line ends would otherwise leave a CR in the sequence
			$line =~ s/\r\z//;
			if ($line =~ /^>/) {
				$tag ++;
				last if $tag == 2;
			} else {
				$seq .= uc($line);
			}
		}
		close $fh;
		[$tag, $seq];
	}

	### The subroutine to identify candidate BSJ read during the 1st scanning of divided SAM
	### Reads with PCC signals are passed to gtag_pem_repeat for multiple seed matching
	### Two types of PCC signals can be detected: two-segment and multiple-segment
	### input: 1.$_[0]: read name; 2.$_[1 .. n]: local alignments of the read

	sub mapping_check_parallel {
		### Purpose: compare the alignments of one read (pair) pairwise and, for every pair of
		### alignments whose CIGARs look like a PCC signal, ask &gtag_pem_repeat for a verdict.
		### input:  1.$_[0]: read name (shifted off and not used afterwards);
		###         2.$_[1 .. n]: tab-separated SAM alignment lines of the read
		### return: [0] when no pair of alignments gives a non-zero verdict; otherwise, for the first
		###         pair with a non-zero verdict, the array ref
		###         [ verdict, chr, cigar slot 1, cigar slot 2, cigar slot 3, site1, site2, sense strand, dinucleotide 1, dinucleotide 2 ]
		###         where verdict/site1/site2/sense strand/dinucleotides are elements [0] .. [4] returned by
		###         &gtag_pem_repeat and one of the three cigar slots is '' depending on the signal form.
		### Reads globals $if_PE, $chrM, $max_circle, $min_circle and $min_mapq_uni.
		### NOTE(review): &MSID, &ten2b and &comp_rev are defined outside this view. Judging from their use
		### here, &MSID returns an array ref whose [0] encodes the alignment shape (-1, 1, or a value giving a
		### product of +/-10), [1] and [2] are clipped/matched lengths and [-1] is the reference span;
		### &ten2b tests one bit of the FLAG; &comp_rev reverse-complements a sequence - confirm against their definitions.
		my $read_name = shift;
		my (@reads, %read_seq, $tag);
		### for each alignment of the read pair, decide its original read (1st read or 2nd read) according to FLAG column in SAM
		### record the alignment into hash %read_seq with original read as key and strand and sequence as value
		### (strand and sequence are taken from the first alignment seen for that read)
		for my $k (@_) {
			my @line = split (/\t/, $k);
			if (&ten2b($line[1],7) == 1) {
				push @{$reads[1]}, $k;
				if ( @_ >= $if_PE+1 and !exists $read_seq{'1'} ) {
					$read_seq{'1'} = [&ten2b($line[1],5), $line[9]];
				}
			} else {
				push @{$reads[0]}, $k;
				if ( @_ >= $if_PE+1 and !exists $read_seq{'0'} ) {
					$read_seq{'0'} = [&ten2b($line[1],5), $line[9]];
				}
			}
		}
		### pairwise comparison is only attempted when at least $if_PE+1 alignments were passed in
		if (@_ >= $if_PE+1) {
			READ: for my $n(0 .. 1) {
				### for each read of the pair, compare alignments pairwise for potential PCC signal
				for my $i (0 .. $#{$reads[$n]}-1) {
					for my $j ($i+1 .. $#{$reads[$n]}) {
						my (%line, %chr, %mapq, %pos, %cigar, %strand, %msid, $str1, $str2, $str3, $str4);
						### parse both alignments; the hashes are keyed by the alignment index ($i or $j)
						for ($i, $j) {
							@{$line{$_}} = split /\t/, ${$reads[$n]}[$_];
							($chr{$_}, $pos{$_}, $mapq{$_}, $cigar{$_}, $strand{$_}, $msid{$_}) = (${$line{$_}}[2], ${$line{$_}}[3], ${$line{$_}}[4], ${$line{$_}}[5], &ten2b(${$line{$_}}[1], 5), &MSID(${$line{$_}}[5], length(${$read_seq{$n}}[1])));
						}
						### both alignments must lie on the same chromosome and strand, and not on the chromosome named by $chrM
						if ( $chr{$i} eq $chr{$j} and $strand{$i} == $strand{$j} and $chr{$i} ne $chrM ) {
							### $x is the alignment with the smaller MSID shape code, $y the other one
							my($x, $y) = sort {${$msid{$a}}[0] <=> ${$msid{$b}}[0]} ($i, $j);
							### Two-segment
							### CIGAR values reflecting potential PCC signal in the form of upstream xS/HyM and downstream xMyS/H, where x and y represent the number of mapping (M), soft clipping (S) or hard clipping (H) bases.
							if ( ${$msid{$x}}[0]*${$msid{$y}}[0] == -1 ) {
								my $cir_scale = ${$msid{$x}}[0]*($pos{$x}+${$msid{$x}}[2])+${$msid{$y}}[0]*($pos{$y}+${$msid{$y}}[2]);
								### the span must be positive and within [$min_circle, $max_circle], the two segment lengths must
								### agree within 6 bases, and at least one alignment must reach $min_mapq_uni
								if ( $cir_scale > 0 and abs(${$msid{$x}}[1]-${$msid{$y}}[1])<=6 and $cir_scale <= $max_circle and $cir_scale >= $min_circle and ($mapq{$x}>=$min_mapq_uni or $mapq{$y}>=$min_mapq_uni) ) {
									### adjust the putative boundaries of the potential BSJ junction
									### (the length difference is split in two halves, $end_adjustment2 taking the remainder)
									my $end_adjustment1 = int((${$msid{$x}}[1]*${$msid{$x}}[0]+${$msid{$y}}[1]*${$msid{$y}}[0])/2);
									my $end_adjustment2 = ${$msid{$x}}[1]*${$msid{$x}}[0]+${$msid{$y}}[1]*${$msid{$y}}[0] - $end_adjustment1;
									### extract the corresponding sequence of each segment in the read
									### (the read is reverse-complemented when the alignment strand differs from the recorded read strand)
									if ( $strand{$x} == ${$read_seq{$n}}[0] ) {
										$str2 = substr(${$read_seq{$n}}[1], 0, ${$msid{$x}}[1]+$end_adjustment1);
										$str1 = substr(${$read_seq{$n}}[1], ${$msid{$x}}[1]+$end_adjustment1);
									} else {
										$str2 = substr(&comp_rev(${$read_seq{$n}}[1]), 0, ${$msid{$x}}[1]+$end_adjustment1);
										$str1 = substr(&comp_rev(${$read_seq{$n}}[1]), ${$msid{$x}}[1]+$end_adjustment1);
									}
									### $str4 is the sequence of the other read of the pair, oriented relative to $strand{$x}
									if ( $read_seq{1-$n}[0] != $strand{$x} ) {
										$str4 = $read_seq{1-$n}[1];
									} else {
										$str4 = &comp_rev( $read_seq{1-$n}[1] );
									}
									### $str4_ok: 0 = the other read has no alignment on this chromosome reaching $min_mapq_uni;
									### 1 = the first such alignment is on the opposite strand and within the putative circle (+/- 6 bases);
									### -1 = the first such alignment is not
									my $str4_ok = 0;
									for my $k2 (@{$reads[1-$n]}) {
										my @line2 = split /\t/, $k2;
										my $MSID2 = &MSID($line2[5], length(${$read_seq{1-$n}}[1]));
										if ( $line2[2] eq $chr{$x} and $line2[4] >= $min_mapq_uni ) {
											if ( &ten2b($line2[1], 5) != $strand{$x} and $line2[3]>=$pos{$x}+$end_adjustment1-6 and $line2[3]+${$MSID2}[-1] <= $pos{$y}+${$msid{$y}}[-1]-$end_adjustment2+6 ) {
												$str4_ok = 1;
												last;
											} else {
												$str4_ok = -1;
												last;
											}
										}
									}
									### pass the info of the read to gtag_pem_repeat for further determination
									### (12 arguments: no str3; the two flags after $str4 tell whether $x / $y reach $min_mapq_uni)
									if ( $mapq{$x}>=$min_mapq_uni and $mapq{$y}>=$min_mapq_uni ) {
										$tag = &gtag_pem_repeat($chr{$x}, $strand{$x}, $str1, $str2, $str4, 1, 1, $str4_ok, $pos{$x}+$end_adjustment1, $pos{$y}+${$msid{$y}}[-1]-1-$end_adjustment2, $end_adjustment1, $end_adjustment2);
									} elsif ($mapq{$x}>=$min_mapq_uni) {
										$tag = &gtag_pem_repeat($chr{$x}, $strand{$x}, $str1, $str2, $str4, 1, 0, $str4_ok, $pos{$x}+$end_adjustment1, $pos{$y}+${$msid{$y}}[-1]-1-$end_adjustment2, $end_adjustment1, $end_adjustment2);
									} elsif ($mapq{$y}>=$min_mapq_uni) {
										$tag = &gtag_pem_repeat($chr{$x}, $strand{$x}, $str1, $str2, $str4, 0, 1, $str4_ok, $pos{$x}+$end_adjustment1, $pos{$y}+${$msid{$y}}[-1]-1-$end_adjustment2, $end_adjustment1, $end_adjustment2);
									}
									### return the result of determination
									if (${$tag}[0] != 0) {
										return [${$tag}[0], $chr{$x}, $cigar{$x}, $cigar{$y}, '', ${$tag}[1], ${$tag}[2], ${$tag}[3], ${${$tag}[4]}[0], ${${$tag}[4]}[1]];
									}
								}
							### Multiple-segment
							### CIGAR values reflecting potential PCC signal in the form of xS/HyMzS/H and corresponding (x + y)S/HzM and/or xM(y + z)S/H, where x, y and z represent the number of mapping (M), soft clipping (S) or hard clipping (H) bases.
							} elsif ( abs(${$msid{$x}}[0]*${$msid{$y}}[0]) == 10 ) {
								### for potential PCC signal in the form of xS/HyMzS/H and corresponding (x + y)S/HzM
								if (${$msid{$x}}[0] == -1) {
									my $cir_scale = $pos{$y}+${$msid{$y}}[-1]-1-$pos{$x};
									if ( $cir_scale > 0 and abs(length(${$read_seq{$n}}[1])-${$msid{$y}}[2]-${$msid{$x}}[1])<=6 and $cir_scale <= $max_circle and $cir_scale >= $min_circle and ($mapq{$x}>=$min_mapq_uni or $mapq{$y}>=$min_mapq_uni) ) {
										### adjust the putative boundaries of the potential BSJ junction
										my $end_adjustment1 = int((${$msid{$y}}[1]+${$msid{$y}}[-1]-${$msid{$x}}[1])/2);
										my $end_adjustment2 = ${$msid{$y}}[1]+${$msid{$y}}[-1]-${$msid{$x}}[1]-$end_adjustment1;
										### extract the corresponding sequence of each segment in the read
										### ($str3 is the leading part of the read, $str2 the middle part, $str1 the trailing part)
										if ($strand{$x} == ${$read_seq{$n}}[0]) {
											$str1 = substr( ${$read_seq{$n}}[1], ${$msid{$x}}[1]+$end_adjustment1 );
											$str2 = substr( ${$read_seq{$n}}[1], ${$msid{$y}}[1], ${$msid{$x}}[1]+$end_adjustment1-${$msid{$y}}[1] );
											$str3 = substr( ${$read_seq{$n}}[1], 0, ${$msid{$y}}[1] );
										} else {
											$str1 = substr( &comp_rev(${$read_seq{$n}}[1]), ${$msid{$x}}[1]+$end_adjustment1 );
											$str2 = substr( &comp_rev(${$read_seq{$n}}[1]), ${$msid{$y}}[1], ${$msid{$x}}[1]+$end_adjustment1-${$msid{$y}}[1] );
											$str3 = substr( &comp_rev(${$read_seq{$n}}[1]), 0, ${$msid{$y}}[1] );
										}
										### $str4 is the sequence of the other read of the pair, oriented relative to $strand{$x}
										if ( $read_seq{1-$n}[0] != $strand{$x} ) {
											$str4 = $read_seq{1-$n}[1];
										} else {
											$str4 = &comp_rev( $read_seq{1-$n}[1] );
										}
										### $str4_ok is set from the first alignment of the other read on this chromosome that reaches $min_mapq_uni
										my $str4_ok = 0;
										for my $k2 ( @{$reads[1-$n]} ) {
											my @line2 = split /\t/, $k2;
											my $MSID2 = &MSID($line2[5], length(${$read_seq{1-$n}}[1]));
											if ( $line2[2] eq $chr{$x} and $line2[4] >= $min_mapq_uni ) {
												if ( &ten2b($line2[1], 5) != $strand{$x} and $line2[3]>=$pos{$x}+$end_adjustment1-6 and $line2[3]+${$MSID2}[-1] <= $pos{$y}+${$msid{$y}}[-1]-$end_adjustment2+6 ) {
													$str4_ok = 1;
													last;
												} else {
													$str4_ok = -1;
													last;
												}
											}
										}
										### pass the info of the read to gtag_pem_repeat for further determination
										### (13 arguments: $str3 is included)
										if ( $mapq{$x}>=$min_mapq_uni and $mapq{$y}>=$min_mapq_uni ) {
											$tag = &gtag_pem_repeat( $chr{$x}, $strand{$x}, $str1, $str2, $str3, $str4, 1, 1, $str4_ok, $pos{$x}+$end_adjustment1, $pos{$y}+${$msid{$y}}[-1]-1-$end_adjustment2, $end_adjustment1, $end_adjustment2 );
										} elsif ($mapq{$x}>=$min_mapq_uni) {
											$tag = &gtag_pem_repeat( $chr{$x}, $strand{$x}, $str1, $str2, $str3, $str4, 1, 0, $str4_ok, $pos{$x}+$end_adjustment1, $pos{$y}+${$msid{$y}}[-1]-1-$end_adjustment2, $end_adjustment1, $end_adjustment2 );
										} elsif ($mapq{$y}>=$min_mapq_uni) {
											$tag = &gtag_pem_repeat( $chr{$x}, $strand{$x}, $str1, $str2, $str3, $str4, 0, 1, $str4_ok, $pos{$x}+$end_adjustment1, $pos{$y}+${$msid{$y}}[-1]-1-$end_adjustment2, $end_adjustment1, $end_adjustment2 );
										}
										### return the result of determination
										if (${$tag}[0] != 0) {
											return [ ${$tag}[0], $chr{$x}, $cigar{$x}, '', $cigar{$y}, ${$tag}[1], ${$tag}[2], ${$tag}[3], ${${$tag}[4]}[0], ${${$tag}[4]}[1] ];
										}
									}
								### for potential PCC signal in the form of xS/HyMzS/H and corresponding xM(y + z)S/H
								} else {
									my $cir_scale = $pos{$x}+${$msid{$x}}[-1]-1-$pos{$y};
									if ( $cir_scale > 0 and abs(${$msid{$x}}[1]-${$msid{$y}}[1])<=6 and $cir_scale <= $max_circle and $cir_scale >= $min_circle and ($mapq{$x}>=$min_mapq_uni or $mapq{$y}>=$min_mapq_uni) ) {
										### adjust the putative boundaries of the potential BSJ junction
										my $end_adjustment1 = int((${$msid{$x}}[1]-${$msid{$y}}[1])/2);
										my $end_adjustment2 = ${$msid{$x}}[1]-${$msid{$y}}[1] - $end_adjustment1;
										### extract the corresponding sequence of each segment in the read
										### ($str2 is the leading part of the read, $str1 the middle part, $str3 the trailing part)
										if ($strand{$x} == ${$read_seq{$n}}[0]) {
											$str2 = substr( ${$read_seq{$n}}[1], 0, ${$msid{$x}}[1]-$end_adjustment2 );
											$str1 = substr( ${$read_seq{$n}}[1], ${$msid{$x}}[1]-$end_adjustment2, length(${$read_seq{$n}}[1])-${$msid{$y}}[2]-${$msid{$x}}[1]+$end_adjustment2 );
											$str3 = substr( ${$read_seq{$n}}[1], length(${$read_seq{$n}}[1])-${$msid{$y}}[2] );
										} else {
											$str2 = substr( &comp_rev(${$read_seq{$n}}[1]), 0, ${$msid{$x}}[1]-$end_adjustment2 );
											$str1 = substr( &comp_rev(${$read_seq{$n}}[1]), ${$msid{$x}}[1]-$end_adjustment2, length(${$read_seq{$n}}[1])-${$msid{$y}}[2]-${$msid{$x}}[1]+$end_adjustment2 );
											$str3 = substr( &comp_rev(${$read_seq{$n}}[1]), length(${$read_seq{$n}}[1])-${$msid{$y}}[2] );
										}
										### $str4 is the sequence of the other read of the pair, oriented relative to $strand{$x}
										if ( $read_seq{1-$n}[0] != $strand{$x} ) {
											$str4 = $read_seq{1-$n}[1];
										} else {
											$str4 = &comp_rev( $read_seq{1-$n}[1] );
										}
										### $str4_ok is set from the first alignment of the other read on this chromosome that reaches $min_mapq_uni
										### (here the circle runs from $pos{$y} to the end of alignment $x)
										my $str4_ok = 0;
										for my $k2(@{$reads[1-$n]}) {
											my @line2 = split /\t/, $k2;
											my $MSID2 = &MSID( $line2[5], length(${$read_seq{1-$n}}[1]) );
											if ( $line2[2] eq $chr{$x} and $line2[4] >= $min_mapq_uni ) {
												if ( &ten2b($line2[1], 5) != $strand{$x} and $line2[3]>=$pos{$y}+$end_adjustment1-6 and $line2[3]+${$MSID2}[-1] <= $pos{$x}+${$msid{$x}}[-1]-$end_adjustment2+6 ) {
													$str4_ok = 1;
													last;
												} else {
													$str4_ok = -1;
													last;
												}
											}
										}
										### pass the info of the read to gtag_pem_repeat for further determination
										### (in this branch the first flag after $str4 follows $mapq{$y} and the second follows $mapq{$x})
										if ( $mapq{$x}>=$min_mapq_uni and $mapq{$y}>=$min_mapq_uni ) {
											$tag = &gtag_pem_repeat( $chr{$x}, $strand{$x}, $str1, $str2, $str3, $str4, 1, 1, $str4_ok, $pos{$y}+$end_adjustment1, $pos{$x}+${$msid{$x}}[-1]-1-$end_adjustment2, $end_adjustment1, $end_adjustment2 );
										} elsif ($mapq{$y}>=$min_mapq_uni) {
											$tag = &gtag_pem_repeat( $chr{$x}, $strand{$x}, $str1, $str2, $str3, $str4, 1, 0, $str4_ok, $pos{$y}+$end_adjustment1, $pos{$x}+${$msid{$x}}[-1]-1-$end_adjustment2, $end_adjustment1, $end_adjustment2 );
										} elsif ($mapq{$x}>=$min_mapq_uni) {
											$tag = &gtag_pem_repeat($chr{$x}, $strand{$x}, $str1, $str2, $str3, $str4, 0, 1, $str4_ok, $pos{$y}+$end_adjustment1, $pos{$x}+${$msid{$x}}[-1]-1-$end_adjustment2, $end_adjustment1, $end_adjustment2 );
										}
										### return the result of determination
										if (${$tag}[0] != 0) {
											return [ ${$tag}[0], $chr{$x}, '', $cigar{$x}, $cigar{$y}, ${$tag}[1], ${$tag}[2], ${$tag}[3], ${${$tag}[4]}[0], ${${$tag}[4]}[1] ];
										}
									}
								}
							}
						}
					}
				}
			}
		}
		### no pair of alignments produced a non-zero verdict: [0] is the value of the last statement
		### and therefore the implicit return value of the subroutine
		if ( !defined $tag or ${$tag}[0] == 0 ) {
			[0];
		}
	}

	### The subroutine to detect splicing signals and paired read mapping signals from the inputted alignments
	### It also differentiates BSJ reads from non-BSJ reads according to adapted maximum likelihood estimation
	### based on multiple seed matching in genomic region 1 and 2
	### key function is index in Perl

	sub gtag_pem_repeat {
		### Purpose: decide whether the segments of a read support a back-splice junction (BSJ) near the
		### putative sites, by looking for splicing dinucleotides (or annotated exon boundaries) around the
		### sites and then matching seeds of the segments against the genomic sequence.
		### input (12 arguments): chr, strand, str[0], str[1], str2, str_ok[0], str_ok[1], str2_ok,
		###                       site1, site2, end_adjt1, end_adjt2
		### input (13 arguments): same, with str3 inserted before str2
		###   str[0]/str[1]: the two read segments flanking the junction; str3: optional third segment;
		###   str2: sequence of the paired read; str_ok[i] == 1 means segment i needs no seed check;
		###   str2_ok: mapping status of the paired read as computed by the caller
		### return: [0] when the junction is rejected, otherwise [code, site1, site2, sense strand, [dinucleotide 1, dinucleotide 2]]
		###         with code 1, -1 or -2 as decided by the paired-read seed matching at the end
		### Reads globals %chr_seq, %exon_start, %exon_end and %gene_loci.
		my ( $chr, $strand, @str, $str3, $str2, @str_ok, $str2_ok, $site1, $site2, $end_adjt1, $end_adjt2 );
		### Variables are assigned according to inputted variables
		### Existence of str3 can be determined by number of inputted variables
		if (@_ == 12) {
			( $chr, $strand, $str[0], $str[1], $str2, $str_ok[0], $str_ok[1], $str2_ok, $site1, $site2, $end_adjt1, $end_adjt2 ) = @_;
		} else {
			( $chr, $strand, $str[0], $str[1], $str3, $str2, $str_ok[0], $str_ok[1], $str2_ok, $site1, $site2, $end_adjt1, $end_adjt2 ) = @_;
		}
		my ($end_string1, $end_string2, $sense_strand, $bibases_bingo);
		my ($tmp_site1, $tmp_site2, $adjt_bp);
		my $total_adjustment = $end_adjt1 + $end_adjt2;
		### size of the flanking (outside-circle) region searched when the circle itself is shorter than this
		my $linear_range_size_min = 50_000;
		### Extract end strings from reference genome according to putative BSJ as well as adjustment
		### (the window is widened by the absolute total adjustment so every possible shift of the junction is covered)
		if ($end_adjt2 >= 0) {
			($tmp_site1, $tmp_site2, $adjt_bp) = ($site1-$end_adjt1-1, $site2-$end_adjt1-1, 2+$end_adjt1+$end_adjt2);
			$end_string1 = substr( $chr_seq{$chr}, $site1-$end_adjt1-4, 4+$end_adjt1+$end_adjt2 );
			$end_string2 = substr( $chr_seq{$chr}, $site2-$end_adjt1-1, 4+$end_adjt1+$end_adjt2 );
		} else {
			($tmp_site1, $tmp_site2, $adjt_bp) = ($site1+$end_adjt2-1, $site2+$end_adjt2-1, 2-$end_adjt1-$end_adjt2);
			$end_string1 = substr( $chr_seq{$chr}, $site1+$end_adjt2-4, 4-$end_adjt1-$end_adjt2 );
			$end_string2 = substr( $chr_seq{$chr}, $site2+$end_adjt2-1, 4-$end_adjt1-$end_adjt2 );
		}
		### Pass end strings to index_compare to find possible splicing signals and the corresponding strand
		### Add U12 AS
		### Each index_compareN returns a hash ref: shift index => strand ('+' or otherwise)
		my $index_strand1 = &index_compare1($end_string1, $end_string2);
		my $index_strand2 = &index_compare2($end_string1, $end_string2);
		my $index_strand3 = &index_compare3($end_string1, $end_string2);
		my $circ_range_seq;
		### NOTE: $miss_count_min is not used in this subroutine
		my ($miss_count_min, $miss_count_max) = (3, 5);
		### %shifts: shift index => [dinucleotide before site1, dinucleotide after site2]
		### %sense_strand_group: shift index => strand; later sources overwrite earlier ones for the same index
		### (the loop variable $strand below shadows the subroutine argument of the same name)
		my %sense_strand_group;
		my %shifts;
		if ( scalar(keys %{$index_strand1}) != 0 ) {
			while ( my ($i, $strand) = each %{$index_strand1} ) {
				if ($strand eq '+') {
					$shifts{$i} = ['AG', 'GT'];
				} else {
					$shifts{$i} = ['AC', 'CT'];
				}
				$sense_strand_group{$i} = $strand;
			}
		}
		if ( scalar(keys %{$index_strand2}) != 0 ) {
			while ( my ($i, $strand) = each %{$index_strand2} ) {
				if ($strand eq '+') {
					$shifts{$i} = ['AG', 'GC'];
				} else {
					$shifts{$i} = ['GC', 'CT'];
				}
				$sense_strand_group{$i} = $strand;
			}
		}
		if ( scalar(keys %{$index_strand3}) != 0 ) {
			while ( my ($i, $strand) = each %{$index_strand3} ) {
				if ($strand eq '+') {
					$shifts{$i} = ['AC', 'AT'];
				} else {
					$shifts{$i} = ['AT', 'GT'];
				}
				$sense_strand_group{$i} = $strand;
			}
		}
		### Exon boundaries/strand recorded in provided GTF/GFF file can also be used
		### (only for shifts without a splicing signal, and only when a start and an end boundary belong to the same gene;
		### the flanking dinucleotides are then taken from the reference sequence itself)
		for my $i (0 .. $adjt_bp) {
			if ( !exists $shifts{$i} and exists $exon_start{$chr} and exists $exon_start{$chr}{$tmp_site1+$i} and exists $exon_end{$chr}{$tmp_site2+$i} ) {
				while ( my ($start_gene, undef) = each %{$exon_start{$chr}{$tmp_site1+$i}} ) {
					while ( my ($end_gene, undef) = each %{$exon_end{$chr}{$tmp_site2+$i}} ) {
						if ($start_gene eq $end_gene) {
							$shifts{$i} = [substr($chr_seq{$chr}, $tmp_site1+$i-3, 2), substr($chr_seq{$chr}, $tmp_site2+$i, 2)];
							$sense_strand_group{$i} = ${$gene_loci{$chr}{$start_gene}}[2];
						}
					}
				}
			}
		}
		### Determination of putative BSJ loci, as well as the sequence in region 1 and region 2
		if ( scalar(keys %shifts) > 0 ) {
			### $junc_ok becomes 1 once some shift reproduces the genomic sequence at both junction ends
			my $junc_ok = 0;
			while ( my ($shift, $bibases) = each %shifts ) {
				my ($diff_adjt, @str_new);
				### 5 read bases plus the 2 flanking dinucleotide bases must match the genome exactly
				my $initial_size = 5+2;
				### $diff_adjt: how far the junction moves relative to the sites passed in
				if ($end_adjt2 >= 0) {
					$diff_adjt = $shift - 1 - $end_adjt1;
				} else {
					$diff_adjt = $shift - 1 + $total_adjustment - $end_adjt1;
				}
				my $site1_new = $site1+$diff_adjt;
				my $site2_new = $site2+$diff_adjt;
				### move $diff_adjt bases from one segment to the other so the segments agree with the shifted junction
				if ( $diff_adjt >= 0 ) {
					my $str_adjustment = substr($str[0], 0, $diff_adjt);
					$str_new[1] = $str[1].$str_adjustment;
					$str_new[0] = substr($str[0], $diff_adjt);
				} elsif ( $diff_adjt < 0 ) {
					my $str_adjustment = substr($str[1], length($str[1]) + $diff_adjt);
					$str_new[0] = $str_adjustment.$str[0];
					$str_new[1] = substr($str[1], 0, length($str[1]) + $diff_adjt);
				}
				### add the flanking dinucleotides so the segments can be compared with the genomic range including its 2-base flanks
				$str_new[0] = ${$bibases}[0].$str_new[0];
				$str_new[1] = $str_new[1].${$bibases}[1];
				my $initial_seq1 = substr($str_new[0], 0, $initial_size);
				my $initial_seq2 = substr($str_new[1], length($str_new[1])-$initial_size, $initial_size);
				### genomic sequence of the putative circle plus flanking bases
				$circ_range_seq = substr($chr_seq{$chr}, $site1_new-3, $site2_new-$site1_new+5);
				if ( substr($circ_range_seq, 0, length($initial_seq1)) eq $initial_seq1 and substr($circ_range_seq, length($circ_range_seq)-length($initial_seq2), length($initial_seq2)) eq $initial_seq2 ) {
					$junc_ok = 1;
					for my $i (0 .. 1) {
						### segments already flagged as ok by the caller are not re-checked
						unless ($str_ok[$i] == 1) {
							### $linear_range: genomic region outside the circle used as the alternative (linear) origin of the segment;
							### upstream of site1 for segment 1, downstream of site2 for segment 0; its size is the circle size
							### when that is at least $linear_range_size_min, otherwise $linear_range_size_min (clipped at the chromosome start)
							my $linear_range;
							my $len_str = length($str_new[$i]);
							if ( $i == 1 and $site2_new-$site1_new+5 >= $linear_range_size_min ) {
								if ( 2*$site1_new >= $site2_new+6 ) {
									$linear_range = substr( $chr_seq{$chr}, 2*$site1_new-$site2_new-6, $site2_new-$site1_new+5 );
								} else {
									$linear_range = substr( $chr_seq{$chr}, 0, $site1_new-1 );
								}
							} elsif ($i == 1) {
								if ($site1_new >= $linear_range_size_min+1) {
									$linear_range = substr( $chr_seq{$chr}, $site1_new-$linear_range_size_min-1, $linear_range_size_min );
								} else {
									$linear_range = substr( $chr_seq{$chr}, 0, $site1_new-1 );
								}
							} elsif ($i == 0 and $site2_new-$site1_new+5 >= $linear_range_size_min) {
								$linear_range = substr( $chr_seq{$chr}, $site2_new, $site2_new-$site1_new+5 );
							} else {
								$linear_range = substr( $chr_seq{$chr}, $site2_new, $linear_range_size_min );
							}
							### Comparison of detected seeds in the two regions
							### Seed is searched iteratively in the descending order of length
							### (seed length is 2*$window_unit, consecutive seeds start $window_unit bases apart)
							for my $window_unit ( 9, 7, 5, 4, 3 ) {
								### segment too short for even one seed of this length
								if ($len_str < $window_unit*2) {
									next;
								}
								my $window_step = $window_unit;
								my $window_size = $window_unit*2;
								my $trial = int( ($len_str-$window_size)/$window_step );
								### counters without suffix refer to the circle range, those with suffix 2 to the linear range;
								### $cont_miss_count is the longest run of consecutive seeds missing from the circle range
								my ($miss_count, $total_miss_count, $cont_miss_count) = (0, 0, 0);
								my ($miss_count2, $total_miss_count2, $cont_miss_count2) = (0, 0, 0);
								my (@loci, @loci2);
								for my $n ( 0 .. $trial ) {
									my $seq = substr( $str_new[$i], $n*$window_step, $window_size );
									#print "$str_new[$i]\t$window_step\t$i\t$n\t$len_str\t$trial\t$str[$i]\t$str[1-$i]\t$shift\t$diff_adjt\t$total_adjustment\n" if !defined $seq;
									my $locus = index($circ_range_seq, $seq);
									my $locus2 = index($linear_range, $seq);
									if ($locus >= 0) {
										push @loci, $locus;
										$miss_count = 0;
									} else {
										$total_miss_count ++;
										$miss_count ++;
										if ($miss_count > $cont_miss_count) {
											$cont_miss_count = $miss_count;
										}
									}
									if ($locus2 >= 0) {
										push @loci2, $locus2;
									} else {
										$total_miss_count2 ++;
									}
								}
								### one extra seed anchored at the end of the segment when the length is not a multiple of the step
								if ($len_str % $window_unit != 0) {
									$trial ++;
									my $seq = substr($str_new[$i], $len_str-$window_size, $window_size);
									my $locus = index($circ_range_seq, $seq);
									my $locus2 = index($linear_range, $seq);
									if ($locus >= 0) {
										push @loci, $locus;
										$miss_count = 0;
									} else {
										$total_miss_count ++;
										$miss_count ++;
										if ($miss_count > $cont_miss_count) {
											$cont_miss_count = $miss_count;
										}
									}
									if ($locus2 >= 0) {
										push @loci2, $locus2;
									} else {
										$total_miss_count2 ++;
									}
								}
								### verdict for this seed length
								if ( $total_miss_count2 == 0 and $total_miss_count == 0 ) {
									### every seed found in both regions: &distance_loci (defined elsewhere) decides between them
									if ( &distance_loci(\@loci, \@loci2, $window_step) == 1 ) {
										$str_ok[$i] = 1;
										last;
									} else {
										return [0];
									}
								} elsif ( $total_miss_count2 <= $total_miss_count ) {
									### the linear range matches at least as well as the circle range
									if (@loci2>0) {
										return [0];
									} else {
										### nothing found in the linear range either: try a shorter seed
									}
								} elsif ($cont_miss_count > $miss_count_max) {
									### too many consecutive misses in the circle range: try a shorter seed
								} elsif ( $total_miss_count*2 > $trial ) {
									### more than half of the seeds missing from the circle range: try a shorter seed
								} else {
									$str_ok[$i] = 1;
									last;
								}
							}
						}
					}
					### the first shift whose junction ends match the genome decides: accept it or reject the read
					if ($str_ok[0] == 1 and $str_ok[1] == 1) {
						@str = @str_new;
						($site1, $site2) = ($site1_new, $site2_new);
						$sense_strand = $sense_strand_group{$shift};
						$bibases_bingo = $bibases;
						last;
					} else {
						return [0];
					}
				}
			}
			### If str3 exists, its seeds will also be searched
			if ($junc_ok == 1 and $str_ok[0] == 1 and $str_ok[1] == 1) {
				if (defined $str3) {
					### fixed 10-base seeds every 5 bases, searched in the circle range only
					my ($window_step, $window_size) = (5, 10);
					my ($miss_count, $total_miss_count, $cont_miss_count) = (0, 0, 0);
					for my $n ( 0 .. int( (length($str3)-$window_size)/$window_step ) ) {
						my $seq = substr( $str3, $n*$window_step, $window_size );
						if (rindex($circ_range_seq, $seq) >= 0) {
							$miss_count = 0;
						} else {
							$total_miss_count ++;
							$miss_count ++;
							if ($miss_count > $cont_miss_count) {
								$cont_miss_count = $miss_count;
							}
						}
					}
					if ($cont_miss_count > $miss_count_max) {
						return [0];
					} elsif ( ($total_miss_count-1)*2 > int( (length($str3)-$window_size)/$window_step ) ) {
						return [0];
					}
				}
				{
					### Paired end mapping signal is also detected by multiple seed matching
					### (only when the paired read sequence is longer than 5 bases; otherwise the junction is accepted as is)
					if (length($str2) > 5) {
						### region outside the circle used as the alternative origin of the paired read:
						### upstream of site1 when $strand == 1, downstream of site2 otherwise
						my $pem_null_range_seq;
						if ( $strand == 1 and $site2-$site1+5 >= $linear_range_size_min ) {
							if ( 2*$site1 >= $site2+6 ) {
								$pem_null_range_seq = substr($chr_seq{$chr}, 2*$site1-$site2-6, $site2-$site1+5);
							} else {
								$pem_null_range_seq = substr($chr_seq{$chr}, 0, $site1-1);
							}
						} elsif ($strand == 1) {
							if ($site1 >= $linear_range_size_min+1) {
								$pem_null_range_seq = substr($chr_seq{$chr}, $site1-$linear_range_size_min-1, $linear_range_size_min);
							} else {
								$pem_null_range_seq = substr($chr_seq{$chr}, 0, $site1-1);
							}
						} elsif ( $strand == 0 and $site2-$site1+5 >= $linear_range_size_min ) {
							$pem_null_range_seq = substr($chr_seq{$chr}, $site2, $site2-$site1+5);
						} else {
							$pem_null_range_seq = substr($chr_seq{$chr}, $site2, $linear_range_size_min);
						}
						my ($window_step, $window_size) = (5, 10);
						my $trial = int( (length($str2)-$window_size)/$window_step );
						### counters without suffix refer to the circle range, those with suffix 2 to the outside region
						my ($miss_count, $total_miss_count, $cont_miss_count) = (0, 0, 0);
						my ($miss_count2, $total_miss_count2, $cont_miss_count2) = (0, 0, 0);
						my (@loci, @loci2);
						for my $n ( 0 .. $trial ) {
							my $seq = substr($str2, $n*$window_step, $window_size);
							my $locus = index($circ_range_seq, $seq);
							if ($locus >= 0) {
								push @loci, $locus;
								$miss_count = 0;
							} else {
								$total_miss_count ++;
								$miss_count ++;
								if ($miss_count > $cont_miss_count) {
									$cont_miss_count = $miss_count;
								}
							}
							### the outside region may be empty (e.g. site at the chromosome edge); then it is not searched
							if (length($pem_null_range_seq) > 0) {
								my $locus2 = index($pem_null_range_seq, $seq);
								if ($locus2 >= 0) {
									push @loci2, $locus2;
								} else {
									$total_miss_count2 ++;
								}
							}
						}
						### final code: 1, -1 or -2 together with the adjusted sites, sense strand and dinucleotides
						if (length($pem_null_range_seq) > 0) {
							if ( $total_miss_count2 == 0 and $total_miss_count == 0 ) {
								### every seed found in both regions: &distance_loci decides
								if ( &distance_loci( \@loci, \@loci2, $window_step ) == 1 ) {
									return [1, $site1, $site2, $sense_strand, $bibases_bingo];
								} else {
									return [-2, $site1, $site2, $sense_strand, $bibases_bingo];
								}
							} elsif ( $total_miss_count2 <= $total_miss_count ) {
								### the outside region matches at least as well as the circle range
								if (@loci2>0) {
									return [-2, $site1, $site2, $sense_strand, $bibases_bingo];
								} else {
									return [-1, $site1, $site2, $sense_strand, $bibases_bingo];
								}
							} elsif ( $total_miss_count*4 > $trial*3 and $str2_ok == 0 ) {
								return [-2, $site1, $site2, $sense_strand, $bibases_bingo];
							} elsif ($cont_miss_count > $miss_count_max) {
								return [-1, $site1, $site2, $sense_strand, $bibases_bingo];
							} elsif ( $total_miss_count*2 > $trial ) {
								return [-1, $site1, $site2, $sense_strand, $bibases_bingo];
							} else {
								return [1, $site1, $site2, $sense_strand, $bibases_bingo];
							}
						} elsif ( $total_miss_count*4 > $trial*3 and $str2_ok == 0 ) {
							return [-2, $site1, $site2, $sense_strand, $bibases_bingo];
						} elsif ($cont_miss_count > $miss_count_max) {
							return [-1, $site1, $site2, $sense_strand, $bibases_bingo];
						} elsif ( $total_miss_count*2 > $trial ) {
							return [-1, $site1, $site2, $sense_strand, $bibases_bingo];
						} else {
							return [1, $site1, $site2, $sense_strand, $bibases_bingo];
						}
					} else {
						return [1, $site1, $site2, $sense_strand, $bibases_bingo];
					}
				}
			} else {
				### no shift reproduced the genomic sequence at the junction ends
				return [0];
			}
		} else {
			### neither a splicing signal nor an annotated exon boundary pair was found
			return [0];
		}
	}

	### The subroutine to scan divided SAM (1st scanning)
	### pass all alignments of the same read to subroutine &mapping_check_parallel
	### print candidate BSJ reads to the corresponding temporary file
	### input: 1.${$_[0]}[0]: file name of divided SAM; 2.${$_[0]}[1]: the adjusted first read of the divided SAM

	sub parallel_scan1 {
		### Scan one divided SAM file (1st scanning) and write candidate BSJ reads to <output_dir><file>.list
		### input:  $_[0]: array ref [ file name of divided SAM, key read name ]
		###         Lines are ignored until one whose first tab-separated field equals the key read;
		###         that line only switches recording on and is not buffered itself.
		### output: for every read whose &mapping_check_parallel result has a non-zero first element,
		###         one tab-separated line: read name followed by every element of the result.
		### Dies when the input cannot be opened or the list file cannot be opened or closed.
		my ($file, $key_read) = @{$_[0]};
		my ($pre_read, @PE_reads, $pinhead);
		### Lexical handles and a lexical line variable: nothing global (IN, OUT, $_) is touched
		open my $sam_fh, "<", $input_dir.$file or die "cannot open $input_dir$file: $!";
		open my $list_fh, ">", $output_dir.$file.'.list' or die "cannot write tmp $output_dir$file.list: $!";
		### Evaluate the alignments buffered for $pre_read; used at each change of read name and at end of file
		my $check_buffered_read = sub {
			### nothing was buffered (empty file or key read never met)
			return unless defined $pre_read;
			### variable tag records the inference whether the pre_read is a candidate BSJ read
			### 0 signifies no; others signify yes
			my $tag = &mapping_check_parallel($pre_read, @PE_reads);
			if (${$tag}[0] != 0) {
				print {$list_fh} "$pre_read";
				print {$list_fh} "\t$_" for @{$tag};
				print {$list_fh} "\n";
			}
		};
		while ( my $record = <$sam_fh> ) {
			chomp $record;
			my ($read_name) = split (/\t/, $record, 2);
			if (defined $pinhead) {
				### alignments of one read are consecutive: a new name closes the previous read
				if (defined $pre_read and $pre_read ne $read_name) {
					$check_buffered_read->();
					@PE_reads = ();
				}
				push @PE_reads, $record;
				$pre_read = $read_name;
			} elsif ($read_name eq $key_read) {
				$pinhead = 1;
			}
		}
		### the last read of the file has no following line to trigger its evaluation
		$check_buffered_read->();
		close $sam_fh;
		### buffered write errors only surface at close
		close $list_fh or die "cannot close tmp $output_dir$file.list: $!";
	}

	### The subroutine to scan divided SAM (2nd scanning)
	### pass all alignments of the same read to subroutine &circ_read_candidate
	### print candidate BSJ reads to the corresponding temporary file2
	### input: 1.${$_[0]}[0]: file name of divided SAM; 2.${$_[0]}[1]: the adjusted first read of the divided SAM

	sub parallel_scan2 {
		### Scan one divided SAM file (2nd scanning) and write the verdict for each read to <output_dir><file>.list2
		### input:  $_[0]: array ref [ file name of divided SAM, key read name ]
		###         Lines are ignored until one whose first tab-separated field equals the key read;
		###         that line only switches recording on and is not buffered itself.
		### output: reads whose name is a key of %cigar_read are skipped. For every other read one
		###         tab-separated line is written from the result of &circ_read_candidate:
		###           first element non-zero: read name followed by every element of the result;
		###           first element zero    : read name, 0, then every element of the array referenced by the second element.
		### Dies when the input cannot be opened or the list2 file cannot be opened or closed.
		my ($file, $key_read) = @{$_[0]};
		my ($pre_read, @PE_reads, $pinhead);
		### Lexical handles and a lexical line variable: nothing global (IN, OUT, $_) is touched
		open my $sam_fh, "<", $input_dir.$file or die "cannot open $input_dir$file: $!";
		open my $list_fh, ">", $output_dir.$file.'.list2' or die "cannot write tmp $output_dir$file.list2: $!";
		### Evaluate the alignments buffered for $pre_read; used at each change of read name and at end of file,
		### so the last read of the file obeys the same %cigar_read filter as all the others
		my $check_buffered_read = sub {
			### nothing was buffered (empty file or key read never met)
			return unless defined $pre_read;
			### read already recorded in %cigar_read: not examined again
			return if exists $cigar_read{$pre_read};
			### variable tag records the inference whether the pre_read is a candidate BSJ read
			### 0 signifies no; others signify yes
			my $tag = &circ_read_candidate(@PE_reads);
			if (${$tag}[0] != 0) {
				print {$list_fh} "$pre_read";
				print {$list_fh} "\t$_" for @{$tag};
				print {$list_fh} "\n";
			} else {
				print {$list_fh} "$pre_read\t0";
				print {$list_fh} "\t$_" for @{${$tag}[1]};
				print {$list_fh} "\n";
			}
		};
		while ( my $record = <$sam_fh> ) {
			chomp $record;
			my ($read_name) = split (/\t/, $record, 2);
			if (defined $pinhead) {
				### alignments of one read are consecutive: a new name closes the previous read
				if ( defined $pre_read and $pre_read ne $read_name ) {
					$check_buffered_read->();
					@PE_reads = ();
				}
				push @PE_reads, $record;
				$pre_read = $read_name;
			} elsif ($read_name eq $key_read) {
				$pinhead = 1;
			}
		}
		### the last read of the file has no following line to trigger its evaluation
		$check_buffered_read->();
		close $sam_fh;
		### buffered write errors only surface at close
		close $list_fh or die "cannot close tmp $output_dir$file.list2: $!";
	}

	### The subroutine to identify additional candidate BSJ read* during the 2nd scanning of divided SAM
	### *Such a BSJ read often has a segment flanking the BSJ that cannot be precisely aligned to the reference sequence by short read mappers
	### Reads with alignment related to known candidate BSJ read are passed to read_circ_range
	### Alignment styles include: xS/HyMzS/H, xS/HyM, xMyS/H
	### input: 1.$_[0 .. n]: local alignments of the read

	sub circ_read_candidate {
		my (@reads, %read_seq);
		my @linear_cp_group;
		my ($pinhead, $pinhead2) = (0, 0);
		# for each alignment of the read pair, decide its orignal read (1st read or 2nd read) according to FLAG colomn in SAM
		# record the alignment into hash %read_seq with orignal read as key and strand and sequence as value
		for my $k (@_) {
			my @line = split (/\t/, $k);
			if (&ten2b($line[1],7) == 1) {
				push @{$reads[1]}, $k;
				unless (exists $read_seq{'1'}) {
					$read_seq{'1'} = [&ten2b($line[1],5), $line[9]];
				}
			} else {
				push @{$reads[0]}, $k;
				unless (exists $read_seq{'0'}) {
					$read_seq{'0'} = [&ten2b($line[1],5), $line[9]];
				}
			}
		}
		for my $i (0 .. 1) {
			for my $k (@{$reads[$i]}) {
				my @line = split /\t/, $k;
				my $length_read = length($read_seq{$i}[1]);
				if ($line[5] eq "${length_read}M") {
					my $division1 = int($line[3]/500);
					for my $i ($division1 .. $division1+1) {
						my $chr_div = "$line[2]:$i";
						for my $j ( @{$chr_division1{$chr_div}} ) {
							if ($line[3] <= $site1_cluster[$j]-6 and $line[3]+$length_read-1>=$site1_cluster[$j]+6) {
								push @linear_cp_group, $j;

							}
						}
						for my $j ( @{$chr_division2{$chr_div}} ) {
							if ($line[3] <= $site2_cluster[$j]-6 and $line[3]+$length_read-1>=$site2_cluster[$j]+6) {
								push @linear_cp_group, $j;
							}
						}
					}
				} elsif ($line[5] eq '*') {
					next;
				# For partial alignment, infer whether the correponding read is from detected candidate circRNA
				} else {
					my $MSID = &MSID($line[5], length(${$read_seq{$i}}[1]));
					my $ten2b5 = &ten2b($line[1],5);
					# Alignment style: xS/HyM
					if ( ${$MSID}[0] == -1 and $line[4] >= $min_mapq_uni ) {
						my $chr_div = "$line[2]:$line[3]";
						for my $clusterID ( @{$chr_range1{$chr_div}} ) {
							# adjust the putative boundaris according to the candidate BSJ junction(s)
							my $distance1 = $line[3] - $site1_cluster[$clusterID];
							# extract the corresponding sequence of key segment in the read
							my ($str, $str2);
							if ( $read_seq{$i}[0] == $ten2b5 ) {
								$str = substr($read_seq{$i}[1], 0, ${$MSID}[1]-$distance1);
							} else {
								$str = substr(&comp_rev($read_seq{$i}[1]), 0, ${$MSID}[1]-$distance1);
							}
							if (length($str) >= 5) {
								# extract the corresponding sequence of its paired read
								if ( $read_seq{1-$i}[0] != $ten2b5 ) {
									$str2 = $read_seq{1-$i}[1];
								} else {
									$str2 = &comp_rev( $read_seq{1-$i}[1] );
								}
								my $str2_ok = 0;
								for my $k2 (@{$reads[1-$i]}) {
									my @line2= split /\t/, $k2;
									my $MSID2 = &MSID($line2[5], length(${$read_seq{1-$i}}[1]));
									if ( $line2[2] eq $line[2] and $line2[4] >= $min_mapq_uni ) {
										if ( &ten2b($line2[1], 5) != $ten2b5 and $line2[3]>=$site1_cluster[$clusterID]-6 and $line2[3] + ${$MSID2}[-1] - 1 <= $site2_cluster[$clusterID] + 6 ) {
											$str2_ok = 1;
											last;
										} else {
											$str2_ok = -1;
											last;
										}
									}
								}
								# pass the info of the read pair to read_circ_range for further determination
								$pinhead = &read_circ_range($ten2b5, 'sm', $clusterID, $str, $str2, $str2_ok);
								if ($pinhead != 0) {
									return [$pinhead, $clusterID, 1, $line[5]];
								} else {
									push @linear_cp_group, $clusterID;
								}
							} else {
								$pinhead = 0;
							}
						}
					# Alignment style: xMyS/H
					} elsif ( ${$MSID}[0] == 1 and $line[4] >= $min_mapq_uni ) {
						my $chr_div = "$line[2]:".($line[3]+${$MSID}[-1]-1);
						for my $clusterID ( @{$chr_range2{$chr_div}} ) {
							# adjust the putative boundaris according to the candidate BSJ junction(s)
							my $distance1 = $line[3]+${$MSID}[-1]-1 - $site2_cluster[$clusterID];
							# extract the corresponding sequence of key segment in the read
							my ($str, $str2);
							if ( $read_seq{$i}[0] == $ten2b5 ) {
								$str = substr($read_seq{$i}[1], ${$MSID}[1] - $distance1);
							} else {
								$str = substr(&comp_rev($read_seq{$i}[1]), ${$MSID}[1]-$distance1);
							}
							if (length($str) >= 5) {
								# extract the corresponding sequence of its paired read
								if ( $read_seq{1-$i}[0] != $ten2b5 ) {
									$str2 = $read_seq{1-$i}[1];
								} else {
									$str2 = &comp_rev( $read_seq{1-$i}[1] );
								}
								my $str2_ok = 0;
								for my $k2 (@{$reads[1-$i]}) {
									my @line2= split /\t/, $k2;
									my $MSID2 = &MSID( $line2[5], length(${$read_seq{1-$i}}[1]) );
									if ( $line2[2] eq $line[2] and $line2[4] >= $min_mapq_uni ) {
										if ( &ten2b($line2[1], 5) != $ten2b5 and $line2[3]>=$site1_cluster[$clusterID]-6 and $line2[3] + ${$MSID2}[-1] - 1 <= $site2_cluster[$clusterID] + 6 ) {
											$str2_ok = 1;
											last;
										} else {
											$str2_ok = -1;
											last;
										}
									}
								}
								# pass the info of the read pair to read_circ_range for further determination
								$pinhead = &read_circ_range( $ten2b5, 'ms', $clusterID, $str, $str2, $str2_ok );
								if ($pinhead != 0) {
									return [$pinhead, $clusterID, 2, $line[5]];
								} else {
									push @linear_cp_group, $clusterID;
								}
							} else {
								$pinhead = 0;
							}
						}
					# Alignment style: xS/HyMzS/H
					} elsif (${$MSID}[0] == 10 and $line[4] >= $min_mapq_uni) {
						my $chr_div = "$line[2]:$line[3]";
						for my $clusterID( @{$chr_range1{$chr_div}} ) {
							# adjust the putative boundaris according to the candidate BSJ junction(s)
							my $distance1 = $line[3] - $site1_cluster[$clusterID];
							# extract the corresponding sequence of key segment in the read
							my ($str, $str2, $str3);
							if ( $read_seq{$i}[0] == $ten2b5 ) {
								$str = substr($read_seq{$i}[1], 0, ${$MSID}[1]-$distance1);
								$str3 = substr($read_seq{$i}[1], $length_read-${$MSID}[2]);
							} else {
								my $comp_rev_read_seq = &comp_rev($read_seq{$i}[1]);
								$str = substr($comp_rev_read_seq, 0, ${$MSID}[1]-$distance1);
								$str3 = substr($comp_rev_read_seq, $length_read-${$MSID}[2]);
							}
							if (length($str) >= 5) {
								# extract the corresponding sequence of its paired read
								if ( $read_seq{1-$i}[0] != $ten2b5 ) {
									$str2 = $read_seq{1-$i}[1];
								} else {
									$str2 = &comp_rev( $read_seq{1-$i}[1] );
								}
								my $str2_ok = 0;
								for my $k2 (@{$reads[1-$i]}) {
									my @line2= split /\t/, $k2;
									my $MSID2 = &MSID($line2[5], length(${$read_seq{1-$i}}[1]));
									if ( $line2[2] eq $line[2] and $line2[4] >= $min_mapq_uni ) {
										if ( &ten2b($line2[1], 5) != $ten2b5 and $line2[3] >= $site1_cluster[$clusterID]-6 and $line2[3] + ${$MSID2}[-1] - 1 <= $site2_cluster[$clusterID] + 6 ) {
											$str2_ok = 1;
											last;
										} else {
											$str2_ok = -1;
											last;
										}
									}
								}
								# pass the info of the read pair to read_circ_range for further determination
								$pinhead = &read_circ_range( $ten2b5, 'sm', $clusterID, $str, $str2, $str3, $str2_ok );
								if ($pinhead != 0) {
									return [$pinhead, $clusterID, 3, $line[5]];
								} else {
									push @linear_cp_group, $clusterID;
								}
							} else {
								$pinhead = 0;
							}
						}
						$chr_div = "$line[2]:".($line[3]+${$MSID}[-1]-1);
						for my $clusterID ( @{$chr_range2{$chr_div}} ) {
							my $distance1 = $line[3]+${$MSID}[-1]-1 - $site2_cluster[$clusterID];
							my ($str, $str2, $str3);
							if ( $read_seq{$i}[0] == $ten2b5 ) {
								$str = substr($read_seq{$i}[1], $length_read-${$MSID}[2]-$distance1);
								$str3 = substr($read_seq{$i}[1], 0, ${$MSID}[1]);
							} else {
								my $comp_rev_read_seq = &comp_rev($read_seq{$i}[1]);
								$str = substr($comp_rev_read_seq, $length_read-${$MSID}[2]-$distance1);
								$str3 = substr($comp_rev_read_seq, 0, ${$MSID}[1]);
							}
							if (length($str) >= 5) {
								if ( $read_seq{1-$i}[0] != $ten2b5 ) {
									$str2 = $read_seq{1-$i}[1];
								} else {
									$str2 = &comp_rev( $read_seq{1-$i}[1] );
								}
								my $str2_ok = 0;
								for my $k2(@{$reads[1-$i]}) {
									my @line2= split /\t/, $k2;
									my $MSID2 = &MSID($line2[5], length(${$read_seq{1-$i}}[1]));
									if ( $line2[2] eq $line[2] and $line2[4] >= $min_mapq_uni) {
										if ( &ten2b($line2[1], 5) != $ten2b5 and $line2[3] >= $site1_cluster[$clusterID]-6 and $line2[3] + $$MSID2[-1] - 1 <= $site2_cluster[$clusterID] + 6 ) {
											$str2_ok = 1;
											last;
										} else {
											$str2_ok = -1;
											last;
										}
									}
								}
								$pinhead = &read_circ_range( $ten2b5, 'ms', $clusterID, $str, $str2, $str3, $str2_ok );
								if ($pinhead != 0) {
									return [$pinhead, $clusterID, 3, $line[5]];
								} else {
									push @linear_cp_group, $clusterID;
								}
							} else {
								$pinhead = 0;
							}
						}
					}
					my $division1 = int($line[3]/500);
					for my $i ($division1 .. $division1+1) {
						my $chr_div = "$line[2]:$i";
						for my $j ( @{$chr_division1{$chr_div}} ) {
							if ( $line[3] <= $site1_cluster[$j]-6 and $line[3]+${$MSID}[-1]-1>=$site1_cluster[$j]+6 ) {
								push @linear_cp_group, $j;
							}
						}
						for my $j ( @{$chr_division2{$chr_div}} ) {
							if ( $line[3] <= $site2_cluster[$j]-6 and $line[3]+${$MSID}[-1]-1>=$site2_cluster[$j]+6 ) {
								push @linear_cp_group, $j;
							}
						}
					}
				}
			}
		}
		if ($pinhead == 0) {
			my %unique_linear_cp;
			$unique_linear_cp{$_}++ for @linear_cp_group;
			@linear_cp_group = keys(%unique_linear_cp);
			[0, \@linear_cp_group];
		}
	}

	### The subroutine to differentiate additional BSJ reads from non-BSJ reads according to adapted maximum likelihood estimation
	### based on multiple seed matching in genomic region 1 and 2
	### Paired read mapping signals are also detected if possible
	### key function is index in Perl

	sub read_circ_range {
		### Decide whether a partially aligned read supports the back-splice junction of candidate $clusterID.
		### input (6 or 7 positional arguments; $str3 is present only in the 7-argument form):
		###   $ten2b5   : digit 5 of the SAM flag of the alignment, as returned by &ten2b
		###   $mode     : 'sm' (alignment style xS/HyM); any other value is handled as xMyS/H
		###   $clusterID: index into @site1_cluster, @site2_cluster, @chr_cluster and @cand_circ_sort
		###   $str      : read segment extracted by the caller, tested against the candidate region
		###   $str2     : sequence of the paired read, oriented by the caller
		###   $str3     : segment of the same read from its other clipped end (optional)
		###   $str2_ok  : 1, -1 or 0 as set by the caller from the alignments of the paired read
		### return:
		###   0      : the junction anchor or the seeds of $str / $str3 do not support the candidate
		###   -1, -2 : $str (and $str3) passed, but the seeds of the paired read $str2 failed one of the checks below
		###   1      : every check passed
		my ($ten2b5, $mode, $clusterID, $str, $str2, $str3, $str2_ok);
		### Variables are assigned according to inputted variables
		### Existence of str3 can be determined by number of inputted variables
		if ( @_ == 6 ) {
			($ten2b5, $mode, $clusterID, $str, $str2, $str2_ok) = @_;
		} else {
			($ten2b5, $mode, $clusterID, $str, $str2, $str3, $str2_ok) = @_;
		}
		# length of the anchor that must match the end (mode 'sm') or the start (other mode) of the candidate region
		my $initial_size = 5+2;
		# $miss_count_max: largest tolerated run of consecutive unmatched seeds;
		# $miss_count_min is only referenced by the empty branch at the end of the $str3 section
		my ($miss_count_min, $miss_count_max) = (3, 5);
		# minimum length of the comparison region ($linear_range) taken outside the candidate region
		my $linear_range_size_min = 50_000;
		# candidate region: site2-site1+5 characters of the chromosome string starting at offset site1-3, upper-cased
		my $circ_range_seq = substr( $chr_seq{$chr_cluster[$clusterID]}, $site1_cluster[$clusterID]-3, $site2_cluster[$clusterID]-$site1_cluster[$clusterID]+5 );
		$circ_range_seq = "\U$circ_range_seq";
		# stays undefined unless the $ten2b5 test after the seed loop stores $linear_range here
		my $pem_null_range_seq;
		# length of $str once it has been extended below
		# NOTE(review): assumes every element of %bibases_circ is 2 characters long - confirm where that hash is filled
		my $len_str = length($str)+2;
		# set to 1 as soon as one seed size accepts $str
		my $tag = 0;
		### Alignment style: xS/HyM or xS/HyMzS/H
		if ($mode eq 'sm') {
			# extend $str on its right with the second element of the candidate's %bibases_circ entry
			$str = $str.${$bibases_circ{$cand_circ_sort[$clusterID]}}[1];
			# the last $initial_size characters of $str must equal the end of the candidate region
			my $initial_seq = substr($str, $len_str-$initial_size, $initial_size);
			if (substr($circ_range_seq, length($circ_range_seq)-length($initial_seq), length($initial_seq)) eq $initial_seq) {
				# comparison region located before site1: as long as the candidate region when that one
				# reaches $linear_range_size_min, otherwise $linear_range_size_min characters;
				# taken from offset 0 when the chromosome start is too close
				my $linear_range;
				if ( $site2_cluster[$clusterID]-$site1_cluster[$clusterID]+5 >= $linear_range_size_min ) {
					if ( 2*$site1_cluster[$clusterID] >= $site2_cluster[$clusterID]+6 ) {
						$linear_range = substr( $chr_seq{$chr_cluster[$clusterID]}, 2*$site1_cluster[$clusterID]-$site2_cluster[$clusterID]-6, $site2_cluster[$clusterID]-$site1_cluster[$clusterID]+5 );
					} else {
						$linear_range = substr($chr_seq{$chr_cluster[$clusterID]}, 0, $site1_cluster[$clusterID]-1);
					}
				} else {
					if ($site1_cluster[$clusterID] >= $linear_range_size_min) {
						$linear_range = substr( $chr_seq{$chr_cluster[$clusterID]}, $site1_cluster[$clusterID]-$linear_range_size_min-1, $linear_range_size_min );
					} else {
						$linear_range = substr( $chr_seq{$chr_cluster[$clusterID]}, 0, $site1_cluster[$clusterID]-1 );
					}
				}
				### Comparison of detected seeds in the two regions
				### Seed is searched iteratively in the descending order of length
				for my $window_unit ( 9, 7, 5, 4, 3 ) {
					# $str is shorter than one seed of this size
					if ($len_str < $window_unit*2) {
						next;
					}
					# seeds are 2*$window_unit long and start every $window_unit characters
					my $window_step = $window_unit;
					my $window_size = $window_unit*2;
					my $trial = int( ($len_str-$window_size)/$window_step );
					# counters without suffix refer to the candidate region, counters with suffix 2 to $linear_range
					# ($miss_count2 and $cont_miss_count2 are never updated)
					my ($miss_count, $total_miss_count, $cont_miss_count) = (0, 0, 0);
					my ($miss_count2, $total_miss_count2, $cont_miss_count2) = (0, 0, 0);
					my (@loci, @loci2);
					# seeds are cut starting from the right end of $str; rindex reports the last occurrence
					for my $i ( 0 .. $trial ) {
						my $seq = substr( $str, $len_str-$i*$window_step-$window_size, $window_size );
						my $locus = rindex($circ_range_seq, $seq);
						my $locus2 = rindex($linear_range, $seq);
						if ($locus >= 0) {
							push @loci, $locus;
							$miss_count = 0;
						} else {
							$total_miss_count ++;
							$miss_count ++;
							if ($miss_count > $cont_miss_count) {
								$cont_miss_count = $miss_count;
							}
						}
						if ($locus2 >= 0) {
							push @loci2, $locus2;
						} else {
							$total_miss_count2 ++;
						}
					}
					# one more seed taken from the very start of $str when its length is not a multiple of the step
					if ($len_str % $window_unit != 0) {
						$trial ++;
						my $seq = substr($str, 0, $window_size);
						my $locus = rindex($circ_range_seq, $seq);
						my $locus2 = rindex($linear_range, $seq);
						if ($locus >= 0) {
							push @loci, $locus;
							$miss_count = 0;
						} else {
							$total_miss_count ++;
							$miss_count ++;
							if ($miss_count > $cont_miss_count) {
								$cont_miss_count = $miss_count;
							}
						}
						if ($locus2 >= 0) {
							push @loci2, $locus2;
						} else {
							$total_miss_count2 ++;
						}
					}
					# every seed found in both regions: &distance_loci decides, and its verdict is final
					if ($total_miss_count2 == 0 and $total_miss_count == 0) {
						if ( &distance_loci(\@loci, \@loci2, $window_step) == 1 ) {
							$tag = 1;
							last;
						} else {
							return 0;
						}
					# the comparison region misses no more seeds than the candidate region:
					# reject if it matched any seed, otherwise retry with the next smaller seed size
					} elsif ( $total_miss_count2 <= $total_miss_count ) {
						if (@loci2>0) {
							return 0;
						} else {
						}
					# too long a run of consecutive misses in the candidate region: retry with a smaller seed size
					} elsif ($cont_miss_count > $miss_count_max) {
					# more than half of the seeds missed in the candidate region: retry with a smaller seed size
					} elsif ( $total_miss_count*2 > $trial ) {
					} else {
						$tag = 1;
						last;
					}
				}
				if ($tag == 1) {
					# the comparison region is kept for the paired-read check only when $ten2b5 is 1
					if ($ten2b5 == 1) {
						$pem_null_range_seq = $linear_range;
					}
				} else {
					return 0;
				}
			} else {
				return 0;
			}
		### Alignment style: xMyS/H or xS/HyMzS/H
		} else {
			# extend $str on its left with the first element of the candidate's %bibases_circ entry
			$str = ${$bibases_circ{$cand_circ_sort[$clusterID]}}[0].$str;
			# the first $initial_size characters of $str must equal the start of the candidate region
			my $initial_seq = substr($str, 0, $initial_size);
			if ( substr($circ_range_seq, 0, length($initial_seq)) eq $initial_seq ) {
				# comparison region starting at offset site2 of the chromosome string: as long as the candidate
				# region when that one reaches $linear_range_size_min, otherwise $linear_range_size_min characters
				my $linear_range;
				if ( $site2_cluster[$clusterID]-$site1_cluster[$clusterID]+5 >= $linear_range_size_min ) {
					$linear_range = substr( $chr_seq{$chr_cluster[$clusterID]}, $site2_cluster[$clusterID], $site2_cluster[$clusterID]-$site1_cluster[$clusterID]+5 );
				} else {
					$linear_range = substr( $chr_seq{$chr_cluster[$clusterID]}, $site2_cluster[$clusterID], $linear_range_size_min );
				}
				### Comparison of detected seeds in the two regions
				### Seed is searched iteratively in the descending order of length
				for my $window_unit ( 9, 7, 5, 4, 3 ) {
					# $str is shorter than one seed of this size
					if ($len_str < $window_unit*2) {
						next;
					}
					# seeds are 2*$window_unit long and start every $window_unit characters
					my $window_step = $window_unit;
					my $window_size = $window_unit*2;
					my $trial = int( ($len_str-$window_size)/$window_step );
					# counters without suffix refer to the candidate region, counters with suffix 2 to $linear_range
					# ($miss_count2 and $cont_miss_count2 are never updated)
					my ($miss_count, $total_miss_count, $cont_miss_count) = (0, 0, 0);
					my ($miss_count2, $total_miss_count2, $cont_miss_count2) = (0, 0, 0);
					my (@loci, @loci2);
					# seeds are cut starting from the left end of $str; index reports the first occurrence
					for my $i ( 0 .. $trial ) {
						my $seq = substr($str, $i*$window_step, $window_size);
						my $locus = index($circ_range_seq, $seq);
						my $locus2 = index($linear_range, $seq);
						if ($locus >= 0) {
							push @loci, $locus;
							$miss_count = 0;
						} else {
							$total_miss_count ++;
							$miss_count ++;
							if ($miss_count > $cont_miss_count) {
								$cont_miss_count = $miss_count;
							}
						}
						if ($locus2 >= 0) {
							push @loci2, $locus2;
						} else {
							$total_miss_count2 ++;
						}
					}
					# one more seed taken from the very end of $str when its length is not a multiple of the step
					if ($len_str % $window_unit != 0) {
						$trial ++;
						my $seq = substr($str, $len_str-$window_size, $window_size);
						my $locus = index($circ_range_seq, $seq);
						my $locus2 = index($linear_range, $seq);
						if ($locus >= 0) {
							push @loci, $locus;
							$miss_count = 0;
						} else {
							$total_miss_count ++;
							$miss_count ++;
							if ($miss_count > $cont_miss_count) {
								$cont_miss_count = $miss_count;
							}
						}
						if ($locus2 >= 0) {
							push @loci2, $locus2;
						} else {
							$total_miss_count2 ++;
						}
					}
					# same decision ladder as in mode 'sm'
					if ( $total_miss_count2 == 0 and $total_miss_count == 0 ) {
						if ( &distance_loci(\@loci, \@loci2, $window_step) == 1 ) {
							$tag = 1;
							last;
						} else {
							return 0;
						}
					} elsif ( $total_miss_count2 <= $total_miss_count ) {
						if (@loci2>0) {
							return 0;
						} else {
						}
					} elsif ( $cont_miss_count > $miss_count_max ) {
					} elsif ( $total_miss_count*2 > $trial ) {
					} else {
						$tag = 1;
						last;
					}
				}
				if ($tag == 1) {
					# the comparison region is kept for the paired-read check only when $ten2b5 is 0
					if ($ten2b5 == 0) {
						$pem_null_range_seq = $linear_range;
					}
				} else {
					return 0;
				}
			} else {
				return 0;
			}
		}
		### If str3 exists, its seeds will also be searched
		if (defined $str3) {
			# seeds of 10 characters every 5 characters, looked up in the candidate region only
			my ($window_step, $window_size) = (5, 10);
			my ($miss_count, $total_miss_count, $cont_miss_count) = (0, 0, 0);
			for my $i ( 0 .. int( (length($str3)-$window_size)/$window_step )) {
				my $seq = substr($str3, $i*$window_step, $window_size);
				if (rindex($circ_range_seq, $seq) >= 0) {
					$miss_count = 0;
				} else {
					$total_miss_count ++;
					$miss_count ++;
					if ($miss_count > $cont_miss_count) {
						$cont_miss_count = $miss_count;
					}
				}
			}
			# reject on a long run of consecutive misses, or when the misses (one is forgiven) exceed half of the steps
			if ($cont_miss_count > $miss_count_max) {
				return 0;
			} elsif ( ($total_miss_count-1)*2 > int( (length($str3)-$window_size)/$window_step ) ) {
				return 0;
			# empty branch: has no effect on the result
			} elsif ($miss_count <= $miss_count_max and $miss_count >= $miss_count_min) {
			}
		}
		{
			### Paired end mapping signal is also detected by multiple seed matching
			if (length($str2) > 5) {
				# seeds of 10 characters every 5 characters, cut from the left end of $str2
				my ($window_step, $window_size) = (5, 10);
				my $trial = int( (length($str2)-$window_size)/$window_step );
				# counters without suffix refer to the candidate region, counters with suffix 2 to $pem_null_range_seq
				my ($miss_count, $total_miss_count, $cont_miss_count) = (0, 0, 0);
				my ($miss_count2, $total_miss_count2, $cont_miss_count2) = (0, 0, 0);
				my (@loci, @loci2);
				for my $i ( 0 .. $trial ) {
					my $seq = substr($str2, $i*$window_step, $window_size);
					my $locus = index($circ_range_seq, $seq);
					if ($locus >= 0) {
						push @loci, $locus;
						$miss_count = 0;
					} else {
						$total_miss_count ++;
						$miss_count ++;
						if ($miss_count > $cont_miss_count) {
							$cont_miss_count = $miss_count;
						}
					}
					# the comparison region is searched only when it was stored above
					if (length($pem_null_range_seq) > 0) {
						my $locus2 = index($pem_null_range_seq, $seq);
						if ($locus2 >= 0) {
							push @loci2, $locus2;
						} else {
							$total_miss_count2 ++;
						}
					}
				}
				if (length($pem_null_range_seq) > 0) {
					# every seed found in both regions: &distance_loci decides
					if ( $total_miss_count2 == 0 and $total_miss_count == 0 ) {
						if ( &distance_loci(\@loci, \@loci2, $window_step) == 1 ) {
						} else {
							return -2;
						}
					# the comparison region misses no more seeds than the candidate region:
					# -2 if it matched any seed, -1 if it matched none
					} elsif ( $total_miss_count2 <= $total_miss_count ) {
						if (@loci2>0) {
							return -2;
						} else {
							return -1;
						}
					# more than three quarters of the seeds missed and the caller reported $str2_ok == 0
					} elsif ( $total_miss_count*4 > $trial*3 and $str2_ok == 0 ) {
						return -2;
					} elsif ($cont_miss_count > $miss_count_max ) {
						return -1;
					} elsif ( $total_miss_count*2 > $trial ) {
						return -1;
					} else {
					}
				# no comparison region available: only the candidate region counters are judged
				} elsif ( $total_miss_count*4 > $trial*3 and $str2_ok == 0 ) {
					return -2;
				} elsif ( $cont_miss_count > $miss_count_max ) {
					return -1;
				} elsif ( $total_miss_count*2 > $trial ) {
					return -1;
				}
			}
		}
		# all checks passed
		1;
	}

	### The subroutine to evaluate the rationality of the distances between matched seeds when
	### unmatched seed found in neither of the genomic regions
	### input: 1.$_[0]: reference of index of matched seeds in genomic region 2
	### input: 2.$_[1]: reference of index of matched seeds in genomic region 1
	### input: 3.$_[2]: window step

	sub distance_loci {
		### Compare how far apart consecutive seed hits lie in two hit lists.
		### return 1 when the first list is compact (summed gaps not above window step * number of hits)
		### and its summed gaps are more than 20 times smaller than those of the second list; 0 otherwise,
		### including when either list holds fewer than two hits.
		my ($loci_first, $loci_second, $window_step) = @_;
		return 0 if @{$loci_first} < 2 or @{$loci_second} < 2;

		# summed absolute distance between neighbouring hits, one value per list
		my @gap_sum;
		for my $loci ($loci_first, $loci_second) {
			my $sum = 0;
			for my $k (1 .. $#{$loci}) {
				$sum += abs($loci->[$k] - $loci->[$k-1]);
			}
			push @gap_sum, $sum;
		}

		my $compact     = $gap_sum[0] <= $window_step * scalar(@{$loci_first});
		my $much_closer = $gap_sum[0]*20 < $gap_sum[1];
		return 1 if $compact and $much_closer;
		return 0;
	}

	### The subroutine to record start and end of a transcript from a gtf/gff formatted annotation
	### input: 1.$_[0]: gene ID of the transcript; 2.$_[1 .. n]: annotated exons in the transcript
	### update %gene_exon_exist and %gene_exon which record each exon and the corresponding gene
	### update %gene_loci which records the start and end of a transcript

	sub split_transcript {
		### Register the exons of one transcript and widen the recorded span of its gene.
		### $_[0] is the gene ID, the remaining arguments are tab-separated annotation lines of the exons.
		my ($gene, @exon_lines) = @_;

		# chromosome and initial start come from the first exon line; initial end and strand from the last one
		my @first_fields = split /\t/, $exon_lines[0];
		my @last_fields = split /\t/, $exon_lines[-1];
		my ($chr, $initial_site) = @first_fields[0, 3];
		my ($final_site, $strand) = @last_fields[4, 6];

		for my $exon_line (@exon_lines) {
			my ($exon_start, $exon_end) = (split /\t/, $exon_line)[3, 4];
			my $exon_key = join ":", $gene, $exon_start, $exon_end;
			# an exon already recorded for this gene neither is stored again nor changes the span
			next if exists $gene_exon_exist{$exon_key};
			push @{ $gene_exon{ $gene } }, [ $exon_start, $exon_end ];
			$gene_exon_exist{$exon_key} = 1;
			if ($exon_start < $initial_site) {
				$initial_site = $exon_start;
			}
			if ($exon_end > $final_site) {
				$final_site = $exon_end;
			}
		}

		# a gene seen before only has its span extended; its recorded strand is kept
		if (exists $gene_loci{$chr}{$gene}) {
			my $span = $gene_loci{$chr}{$gene};
			$span->[0] = $initial_site if $span->[0] > $initial_site;
			$span->[1] = $final_site if $span->[1] < $final_site;
		} else {
			$gene_loci{$chr}{$gene} = [ $initial_site, $final_site, $strand ];
		}
	}

	### The subroutine to find paired splicing signals in two designated sequences
	### input: 1.$_[0]: sequence 1; 2.$_[1]: sequence 2
	### return paired splicing signals as well as their position in the sequences

	### Add U12 AS
	### Shared worker of index_compare1, index_compare2 and index_compare3.
	### input: 1.$_[0]: reference of the dinucleotide table [ [a0, a1], [b0, b1] ], where a0/a1 are searched
	###                 in sequence 1 and b0/b1 in sequence 2
	###        2.$_[1]: sequence 1; 3.$_[2]: sequence 2
	### return reference of a hash: offset => '-' when a0 and b0 both occur at that offset,
	###                             offset => '+' when a1 and b1 both occur at that offset
	sub _index_compare_signals {
		my ($bibases, @seqs) = @_;
		my @strand_index = ('-', '+');
		my %base_index;
		for my $i (0 .. 1) {
			# upper-case each sequence once instead of once per index() call
			my $seq = "\U$seqs[$i]";
			for my $j (0 .. 1) {
				# record every occurrence, overlapping ones included
				my $index = index($seq, $bibases->[$i][$j]);
				while ($index != -1) {
					$base_index{$index}[$i] = $j;
					$index = index($seq, $bibases->[$i][$j], $index+1);
				}
			}
		}
		my %index_strand;
		while ( my ($index, $i_ref) = each %base_index ) {
			# keep an offset only when both sequences carry a signal there and the two signals belong together
			if ( defined ${$i_ref}[0] and defined ${$i_ref}[1] and ${$i_ref}[0] == ${$i_ref}[1] ) {
				$index_strand{$index} = $strand_index[${$i_ref}[0]];
			}
		}
		\%index_strand;
	}

	### Paired signals AC (sequence 1) with CT (sequence 2) => '-', AG with GT => '+'
	sub index_compare1 {
		_index_compare_signals( [ ['AC', 'AG'], ['CT', 'GT'] ], @_[0, 1] );
	}

	### Paired signals GC (sequence 1) with CT (sequence 2) => '-', AG with GC => '+'
	sub index_compare2 {
		_index_compare_signals( [ ['GC', 'AG'], ['CT', 'GC'] ], @_[0, 1] );
	}

	### Paired signals AT (sequence 1) with GT (sequence 2) => '-', AC with AT => '+'
	sub index_compare3 {
		_index_compare_signals( [ ['AT', 'AC'], ['GT', 'AT'] ], @_[0, 1] );
	}

	### The subroutine to transfer a sequence composed of capital ATCG to its reverse complement
	### input: 1.$_[0]: sequence

	sub comp_rev {
		### Reverse the sequence, then swap A<->T and C<->G; any other character is only moved, not changed.
		my ($sequence) = @_;
		my $rev_comp = reverse $sequence;
		$rev_comp =~ tr/ATCG/TAGC/;
		return $rev_comp;
	}

	### The subroutine to decipher MSID values in SAM
	### input: 1.$_[0]: MSID value; 2.$_[1]: read length

	sub MSID {
		### Decode a CIGAR string into a reference of a 4-element array [type, x, y, span]:
		###   type : 0 = no clipping (or not interpretable), 1 = clipped at the right end only,
		###          -1 = clipped at the left end only, 10 = clipped at both ends
		###   x, y : clip-related lengths, their meaning depends on type (see each return below);
		###          undef when the CIGAR is not interpretable
		###   span : summed M and D lengths, or -1 / -2 when the CIGAR is not interpretable
		### Side effect: unless the CIGAR consists of a single operation, every H in the caller's
		### variable is replaced by S, because $_[0] is an alias of the argument.
		my $read_length = $_[1];
		my @counts = split /[MSIDH]/, $_[0];

		# one single operation: only "<read length>M" is accepted
		if (@counts == 1) {
			return [0, 0, 0, $read_length] if $_[0] eq "${read_length}M";
			return [0, undef, undef, -1];
		}

		# hard clips are handled like soft clips from here on
		$_[0] =~ s/H/S/g;
		# operation letters; the leading field in front of the first number is discarded
		my (undef, @styles) = split /\d+/, $_[0];

		if (@counts == 2) {
			# xMyS
			return [1, $counts[0], $counts[0]-1, $counts[0]] if $styles[0] eq 'M' and $styles[1] eq 'S';
			# xSyM
			return [-1, $counts[0], 0, $counts[1]] if $styles[0] eq 'S' and $styles[1] eq 'M';
			return [0, undef, undef, -2];
		}

		if (@counts == 3) {
			# xSyMzS (the middle operation is not checked)
			return [10, $counts[0], $counts[2], $counts[1]] if $styles[0] eq 'S' and $styles[2] eq 'S';
			if ( $styles[0] eq 'M' and $styles[2] eq 'M' ) {
				# one deletion lengthens, one insertion shortens the span relative to the read length
				return [0, 0, 0, $read_length+$counts[1]] if $styles[1] eq 'D';
				return [0, 0, 0, $read_length-$counts[1]] if $styles[1] eq 'I';
			}
			return [0, undef, undef, -2];
		}

		# any other number of operations: add up M and D lengths between two operation indices;
		# the result is undef when the interval holds neither M nor D
		my $m_plus_d = sub {
			my ($from, $to) = @_;
			my ($m_total, $d_total);
			for my $k ($from .. $to) {
				if ($styles[$k] eq 'M') {
					$m_total += $counts[$k];
				} elsif ($styles[$k] eq 'D') {
					$d_total += $counts[$k];
				}
			}
			return defined $d_total ? $m_total+$d_total : $m_total;
		};

		my ($lead, $trail) = ($styles[0], $styles[-1]);
		if ( $lead eq 'M' and $trail eq 'S' ) {
			my $span = $m_plus_d->(0, $#styles-1);
			return [1, $read_length-$counts[-1], $span-1, $span];
		}
		if ( $lead eq 'S' and $trail eq 'M' ) {
			return [-1, $counts[0], 0, $m_plus_d->(1, $#styles)];
		}
		if ( $lead eq 'M' and $trail eq 'M' ) {
			return [0, 0, 0, $m_plus_d->(0, $#styles)];
		}
		if ( $lead eq 'S' and $trail eq 'S' ) {
			return [10, $counts[0], $counts[-1], $m_plus_d->(1, $#styles-1)];
		}
		return [0, undef, undef, -2];
	}

	### The subroutine to transfer a decimal number to a binary one and return the designated digit
	### input: 1.$_[0]: decimal number; 2.$_[1]: designated digit

	sub ten2b {
		### Return the binary digit number $_[1] of $_[0], counted from the least significant digit
		### starting at 1; 0 when the binary form of the number has fewer digits than that.
		my ($decimal, $digit) = @_;
		my $bits = sprintf("%b", $decimal);
		return 0 if $digit > length($bits);
		# least significant digit first, so that digit n sits at offset n-1
		my $lsb_first = reverse $bits;
		return substr($lsb_first, $digit-1, 1);
	}

	### The subroutine to divide the SAM file according to thread number designated by user.
	### input: 1.$_[0]: SAM file; 2.$_[1]: thread number
	### system commands ls and split are used
	### record the names of divided SAM and the corresponding first read as key and value of %split_files_key_read

	sub split_sam_file {
		### Divide the SAM file into $n byte-sized chunks (when $n >= 2) and record the first read of every piece.
		### input: 1.$_[0]: path of the SAM file; 2.$_[1]: thread number
		### effects when $n >= 2:
		###   - $input_dir is set to $output_dir and the chunks are written there by the system command split,
		###     named <SAM file name><letters>
		###   - split cuts at byte positions, so the leading lines of each chunk (up to, not including, the line
		###     where a third distinct read name appears) are appended to the previous chunk; the read name on
		###     that line is stored in %split_files_key_read under the chunk name
		###   - for the first chunk the stored name is the one on the first line not starting with "@"
		### effects when $n < 2: the SAM file is read from $input_dir and only its first non-"@" read name is stored
		### dies when the file size cannot be obtained, when split fails or when a file cannot be opened
		my ($in, $n) = @_;
		# file name without its directory part
		my $in_raw;
		if (rindex($in, "/")>=0) {
			$in_raw = (substr($in, rindex($in, "/")+1));
		} else {
			$in_raw = $in;
		}
		if ($n >= 2) {
			$input_dir = $output_dir;
			# -s reports the size in bytes directly: no shell, no temporary listing file, and a symbolic
			# link is measured by its target, which is what split is going to read
			my $size = -s $in;
			unless ($size) {
				die "CIRI cannot get size of $in to split it: $!";
			}
			# chunk size rounded up so that no more than $n chunks are produced
			my $division = int($size/$n);
			$division ++ if $size%$n != 0;
			# list form keeps the shell away from paths containing blanks or metacharacters
			system('split', '-b', $division, $in, $input_dir.$in_raw) == 0
				or die "CIRI cannot split $in (system status $?): $!";
			# chunks are named <file name><lower case letters>; the name is matched literally and in full
			opendir my $dir, $input_dir or die "cannot open directory $input_dir: $!";
			my @split_files_sort = sort { $a cmp $b } grep { /^\Q$in_raw\E[a-z]+$/ } readdir $dir;
			closedir $dir;
			for my $i (1 .. $#split_files_sort) {
				my (%read_name, @add_reads);
				open my $chunk, "<", $input_dir.$split_files_sort[$i] or die "cannot open $input_dir$split_files_sort[$i]: $!";
				while (my $record = <$chunk>) {
					chomp $record;
					my @line = split /\t/, $record;
					$read_name{$line[0]} ++;
					if (scalar(keys %read_name) >= 3) {
						# third distinct name reached: this read starts the chunk, everything before it
						# completes the previous chunk
						$split_files_key_read{$split_files_sort[$i]} = $line[0];
						open my $previous, ">>", $input_dir.$split_files_sort[$i-1] or die "cannot append to $input_dir$split_files_sort[$i-1]: $!";
						print {$previous} "$_\n" for @add_reads;
						close $previous or die "cannot write to $input_dir$split_files_sort[$i-1]: $!";
						@add_reads = ();
						last;
					} else {
						push @add_reads, $record;
					}
				}
				close $chunk;
			}
			open my $first_chunk, "<", $input_dir.$split_files_sort[0] or die "cannot open $input_dir$split_files_sort[0]: $!";
			while (my $record = <$first_chunk>) {
				chomp $record;
				my @line = split /\t/, $record;
				unless ($record =~ /^[@]/) {
					$split_files_key_read{$split_files_sort[0]} = $line[0];
					last;
				}
			}
			close $first_chunk;
		} else {
			open my $sam_in, "<", $input_dir.$in_raw or die "cannot open $input_dir${in_raw}: $!";
			while (my $record = <$sam_in>) {
				chomp $record;
				my @line = split /\t/, $record;
				unless ($record =~ /^[@]/) {
					$split_files_key_read{$in_raw} = $line[0];
					last;
				}
			}
			close $sam_in;
		}
	}
}
