#!/usr/bin/env nextflow

nextflow.enable.dsl=2

// Define parameters
params.fastq_dir      = '/home/gpsr/webserver/cgidocs/anshu/RAPID-engine/rnaseq'
params.output_dir     = '/home/gpsr/webserver/cgidocs/anshu/RAPID-engine/rnaseq/rnaseq_results'
params.ref_genome     = '/home/gpsr/webserver/cgidocs/anshu/RAPID-engine/reference_genome/acinetobacter_genome.fa'
params.ref_annotation = '/home/gpsr/webserver/cgidocs/anshu/RAPID-engine/reference_genome/annotation.gff'
params.threads        = 32
params.work_dir       = '/home/gpsr/webserver/cgidocs/anshu/RAPID-engine/reference_genome/work1'
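// Example invocation (script name and paths illustrative; every params.*
// value above can be overridden on the command line, and -w sets the
// Nextflow work directory):
//   nextflow run rnaseq.nf --fastq_dir /data/fastq --threads 16 \
//       --output_dir /data/results -w work1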

// Process to build HISAT2 index
process build_index {
    tag "build_index"
    publishDir "${params.output_dir}/index", mode: 'copy'

    input:
    path genome

    output:
    path "index.*"

    script:
    """
    hisat2-build $genome index
    """
}
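
// Note: hisat2-build emits eight files, index.1.ht2 through index.8.ht2, all
// captured by the "index.*" glob above. If build time matters, hisat2-build
// also accepts -p <threads> for parallel index construction.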

// Process for FastQC analysis
process fastqc_analysis {
    tag { sample_id }
    publishDir "${params.output_dir}/qc", mode: 'copy'

    input:
    tuple val(sample_id), path(read1), path(read2)

    output:
    path "qc_results/*"

    script:
    """
    mkdir -p qc_results
    fastqc -o qc_results $read1 $read2
    """
}
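
// FastQC writes one <read>_fastqc.html and one <read>_fastqc.zip per input
// file into qc_results/, all captured by the "qc_results/*" glob above.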

// Process for aligning reads
process align_reads {
    tag { sample_id }
    publishDir "${params.output_dir}/alignment", mode: 'copy'

    input:
    path index_files
    tuple val(sample_id), path(read1), path(read2)

    output:
    tuple val(sample_id), path("${sample_id}.sam")

    script:
    """
    echo "Aligning $sample_id with $read1 and $read2"
    hisat2 -p ${params.threads} --dta -x index -1 $read1 -2 $read2 -S ${sample_id}.sam
    """
}
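
// Note: "-x index" refers to the basename of the staged index.* files, and
// --dta reports alignments tailored for downstream transcript assemblers.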

// Process to convert SAM to BAM
process sam_to_bam {
    tag { sample_id }
    publishDir "${params.output_dir}/unsorted", mode: 'copy'

    input:
    tuple val(sample_id), path(sam_file)

    output:
    tuple val(sample_id), path("${sample_id}.bam")

    script:
    """
    echo "Converting $sam_file to bam for $sample_id"
    samtools view -b $sam_file -o ${sample_id}.bam
    """
}
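
// Design note: the intermediate SAM file could be skipped entirely by piping
// hisat2 straight into samtools inside align_reads, e.g. (sketch):
//   hisat2 ... | samtools sort -@ ${params.threads} -o ${sample_id}_sorted.bam -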

// Process to sort BAM files
process sort_bam {
    tag { sample_id }
    publishDir "${params.output_dir}/sorted", mode: 'copy'

    input:
    tuple val(sample_id), path(bam_file)

    output:
    tuple val(sample_id), path("${sample_id}_sorted.bam")

    script:
    """
    samtools sort -@ ${params.threads} $bam_file -o ${sample_id}_sorted.bam
    """
}
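
// If downstream tools ever need random access to the alignments (e.g. IGV),
// an index could be generated here as well: samtools index ${sample_id}_sorted.bam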

// Process to count features
process feature_count {
    tag { sample_id }
    publishDir "${params.output_dir}/counts", mode: 'copy'

    input:
    tuple val(sample_id), path(sorted_bam_file)
    path annotation

    output:
    path "${sample_id}.count"

    script:
    """
    htseq-count -r pos -f bam -t gene -i Name $sorted_bam_file $annotation > ${sample_id}.count
    """
}
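
// Caution: htseq-count defaults to --stranded=yes; add "-s no" (or
// "-s reverse") above if the library prep is not forward-stranded. "-i Name"
// assumes every gene feature in the GFF carries a Name attribute, and
// "-r pos" matches the coordinate-sorted BAMs produced by sort_bam.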

// Process to consolidate count files
process consolidate_counts {
    publishDir "${params.output_dir}", mode: 'copy'

    input:
    path count_files

    output:
    path "consolidated_count_table.tsv"

    script:
    """
    #!/usr/bin/env python3

    import pandas as pd
    import os

    # Specific lines to remove
    lines_to_remove = [
        '__no_feature',
        '__ambiguous',
        '__too_low_aQual',
        '__not_aligned',
        '__alignment_not_unique'
    ]

    # Get all count files
    count_files = sorted(f for f in os.listdir('.') if f.endswith('.count'))

    # Initialize a dictionary to store consolidated data
    consolidated_counts = {}

    # Read each count file
    for count_file in count_files:
        sample_name = count_file.replace('.count', '')
        df = pd.read_csv(count_file, sep='\\t', header=None, names=['gene', sample_name])
        
        # Remove specific lines
        df = df[~df['gene'].isin(lines_to_remove)]
        
        # Use gene as index and the count as value
        consolidated_counts[sample_name] = df.set_index('gene')[sample_name]

    # Combine all count data
    final_df = pd.concat(consolidated_counts.values(), axis=1)
    
    # Save the consolidated count table
    final_df.to_csv('consolidated_count_table.tsv', sep='\\t')
    """
}
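
// The consolidated table has one row per gene and one column per sample, with
// columns in sorted sample-name order; all samples share the same gene set
// because they were counted against the same annotation.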

// Workflow
workflow {
    // Ensure all input files exist
    if (!file(params.ref_genome).exists()) error "Reference genome not found: ${params.ref_genome}"
    if (!file(params.ref_annotation).exists()) error "Annotation file not found: ${params.ref_annotation}"
    if (!file(params.fastq_dir).exists()) error "FASTQ directory not found: ${params.fastq_dir}"

    // Create a channel of all fastq files
    all_fastq_files = Channel
        .fromPath("${params.fastq_dir}/*.fastq.gz")
        .map { file -> 
            def matcher = file.name =~ /^(SRR\d+)_(\d)\.fastq\.gz$/
            if (matcher) {
                return [matcher.group(1), matcher.group(2), file]
            }
            return null
        }
        .filter { it != null }

    // Group files by sample ID
    samples = all_fastq_files
        .groupTuple(size: 2)
        .map { sample_id, read_numbers, files ->
            // Sort files to ensure read1 and read2 are in correct order
            def sorted_files = files.sort { a, b -> 
                def a_read = (a.name =~ /^(SRR\d+)_(\d)\.fastq\.gz$/)[0][2]
                def b_read = (b.name =~ /^(SRR\d+)_(\d)\.fastq\.gz$/)[0][2]
                a_read <=> b_read
            }
            [sample_id, sorted_files[0], sorted_files[1]]
        }
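
    // At this point each `samples` element looks like (illustrative ID):
    //   [ 'SRR1234567', SRR1234567_1.fastq.gz, SRR1234567_2.fastq.gz ]
    //
    // A more concise equivalent of the two steps above, shown as a sketch that
    // assumes the same SRR*_{1,2}.fastq.gz naming convention:
    //   samples = Channel
    //       .fromFilePairs("${params.fastq_dir}/SRR*_{1,2}.fastq.gz")
    //       .map { sample_id, reads -> [sample_id, reads[0], reads[1]] }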

    // Execute pipeline steps
    index_files = build_index(file(params.ref_genome))
    
    // Fail fast if no FASTQ pairs were found
    samples = samples.ifEmpty {
        error "No input files found matching the pattern in ${params.fastq_dir}"
    }

    qc_results = fastqc_analysis(samples)
    // collect() delivers the index files as a single value channel, so they
    // are reused for every sample instead of being consumed once
    sam_files = align_reads(index_files.collect(), samples)
    bam_files = sam_to_bam(sam_files)
    sorted_bam_files = sort_bam(bam_files)
    count_files = feature_count(sorted_bam_files, file(params.ref_annotation))

    // Consolidate count files after generating them
    consolidate_counts(count_files.collect())

    // Log final outputs
    count_files.view { file -> "Feature count generated: $file" }

    // Debug: Print sample information
    samples.view { sample ->
        "Sample: ${sample[0]}, Read1: ${sample[1]}, Read2: ${sample[2]}"
    }
}
