0.1 load all packages required

library(cowplot)
library(ggforce)
library(ggrepel)
library(ggpubr)
library(knitr)
library(ggrastr)
library(DescTools)
library(paletteer)
library(ggalluvial)
library(plotly)
library(ggbeeswarm)
library(h2o)
library(ggokabeito)
library(scico)
library(tidyverse)

theme_set(theme_bw())

###################################################################################################
# args  =  commandArgs(TRUE)
# 
# argmat  =  sapply(strsplit(args, "="), identity)
# 
# for (i in seq.int(length = ncol(argmat))) {
#   assign(argmat[1, i], argmat[2, i])
# }
# 
# # available variables
# print(ls())
###################################################################################################

############################################################################################### ############################################################################################### ############################################################################################### # ########################### GL ping-pong analysis ############################################### ############################################################################################### ############################################################################################### ###############################################################################################

### get the table file name for Brennecke linkage
RAW=read_tsv("TE_piRNA_5p3p_matrix.txt",col_types=list(TE=col_character()))
RAW
## # A tibble: 27,565,427 × 6
##    TE            POS `5END_plus` `3END_plus` `5END_minus` `3END_minus`
##    <chr>       <dbl>       <dbl>       <dbl>        <dbl>        <dbl>
##  1 FBgn0003187     0           0           0            0            0
##  2 FBgn0003187     1           0           0            0            0
##  3 FBgn0003187     2           0           0            0            0
##  4 FBgn0003187     3           0           0            0            0
##  5 FBgn0003187     4           0           0            0            0
##  6 FBgn0003187     5           0           0            0            0
##  7 FBgn0003187     6           0           0            0            0
##  8 FBgn0003187     7           0           0            0            0
##  9 FBgn0003187     8           0           0            0            0
## 10 FBgn0003187     9           0           0            0            0
## # ℹ 27,565,417 more rows
START=0
END=19
POSITION=9
START.position_B=1-START
LINK.position=POSITION-START+1
SPAN=END-START+1
THRESHOLD=-1

TEs = RAW %>%
  distinct(TE) %>%
  filter(TE=="FBgn0010225")%>%
  pull(TE)%>%
  head(n=10000)

results_old <- data.frame(
  TE = character(),
  mean = numeric(),
  sd = numeric(),
  z_score = numeric(),
  stringsAsFactors = FALSE
)

for(currTE in TEs){
  # print(currTE)

  TABLE = RAW %>%
    filter(TE==currTE)

  LENGTH = nrow(TABLE)
  END.position_B = LENGTH-END

  if( LENGTH >30){

    table_a = TABLE$`5END_plus`
    table_b = TABLE$`5END_minus`
  
    #not sure why this is required
    if (POSITION > 0){
      start.position_R = 1
      end.position_R = LENGTH-POSITION
    } else {
      start.position_R = 1-POSITION
      end.position_R = LENGTH
    }
  
    ### define a data.frame and a vector
    offset <- data.frame()
    sum_offset = NULL
  
    ### core of Brennecke linkage calculation
    offset <- sapply(1:SPAN, function(i) {
      sapply(START.position_B:END.position_B, function(j) {
        # Only calculate offset if the initiating 5' end is above threshold
        if(table_a[j] > THRESHOLD) {
          table_a[j]*table_b[j+START+i-1]/sum(table_a[(j+START):(j+END)])

        } else {
          0
        }
      })
    })

    offset[is.na(offset)] <- 0
  
    total_offset_sum = sum(offset)
  
    # Initialize sum_offset3 with zeros
    sum_offset3 = rep(0, SPAN + 1)
  
    # Only calculate z-score and offsets if we have valid pairs
    if(total_offset_sum > 0) {
      for (i in 1:SPAN) {
        sum_offset[i] = sum(offset[,i])/total_offset_sum
      }

      # plotNAME = paste("pingpong-offset - ", currTE, " - plus_5end vs minus_5end",sep="")
      # fileNAME = paste(outDIR,currTE, ".plus.noFilter.pdf",sep="")
    
      # pdf(file=fileNAME,width=7,height=6)
      # plot(sum_offset,
      #      main=plotNAME,
      #      xlab="offset [nt]",
      #      ylim=c(0,0.6))
      # dev.off()
    
      sum_offset2 = sum_offset[c(1:(LINK.position-1),(LINK.position+1):SPAN)]
      m = mean(sum_offset2)
      M = median(sum_offset2)
      s = sd(sum_offset2)
      z = (sum_offset[LINK.position]-m)/s
      sum_offset3 = c(z,sum_offset)
    }

    if(m > 0){
      # Write table regardless of whether we found valid pairs
      print(paste("Writing table for TE:", currTE," with z-score:", z, " mean:", m, " median:",M, " sd:", s))
      results_old <- rbind(results_old, data.frame(
          TE = currTE,
          mean = m,
          median = M,
          sd = s,
          z_score = z,
          stringsAsFactors = FALSE
      ))    }
    m=0
  }
}
## [1] "Writing table for TE: FBgn0010225  with z-score: 0.0269791817563011  mean: 0.049948161756928  median: 0.047070661495624  sd: 0.0384283285833402"
library(parallel)
library(tictoc)
library(data.table)
## Warning: package 'data.table' was built under R version 4.3.3
## 
## Attaching package: 'data.table'
## The following object is masked from 'package:tictoc':
## 
##     shift
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following object is masked from 'package:purrr':
## 
##     transpose
## The following objects are masked from 'package:h2o':
## 
##     hour, month, week, year
## The following object is masked from 'package:DescTools':
## 
##     %like%
START=0
END=19
POSITION=9
START.position_B=1-START
LINK.position=POSITION-START+1
SPAN=END-START+1
THRESHOLD=0.01

# Convert to data.table for faster filtering
setDT(RAW)

# Get first 100 unique TEs
TEs = RAW[, unique(TE)][1:100]
TEs = RAW[, unique(TE)]

# Pre-create index ranges to avoid repeated calculations
span_seq <- 1:SPAN
start_end_seq <- START:END

# Optimized processing function
process_TE <- function(currTE) {
  # Use data.table filtering
  TABLE <- RAW[TE == currTE]
  
  LENGTH <- nrow(TABLE)
  
  if(LENGTH <= 30) return(NULL)
  
  END.position_B <- LENGTH - END
  j_range <- START.position_B:END.position_B
  j_len <- length(j_range)
  
  # Use get() for columns with special characters
  table_a <- get("5END_plus", TABLE)
  table_b <- get("5END_minus", TABLE)
  
  # Vectorized denominator calculation with matrix operations
  idx_matrix <- outer(j_range, start_end_seq, "+")
  denominators <- rowSums(matrix(table_a[idx_matrix], nrow = j_len))
  
  # Pre-allocate offset matrix
  offset <- matrix(0, nrow = j_len, ncol = SPAN)
  
  # Vectorized offset calculation
  mask <- table_a[j_range] > THRESHOLD
  for(i in span_seq) {
    idx <- j_range + START + i - 1
    offset[, i] <- ifelse(mask & denominators > 0,
                          table_a[j_range] * table_b[idx] / denominators,
                          0)
  }
  
  total_offset_sum <- sum(offset)
  
  if(total_offset_sum == 0) return(NULL)
  
  # Fast column sums
  sum_offset <- .colSums(offset, j_len, SPAN) / total_offset_sum
  
  # Calculate z-score
  sum_offset2 <- sum_offset[-LINK.position]
  m <- mean(sum_offset2)
  M <- median(sum_offset2)
  
  if(M == 0 ) return(NULL)
  
  s <- sd(sum_offset2)
  z <- (sum_offset[LINK.position] - m) / s
  
  list(TE = currTE, responder = sum_offset[LINK.position], mean = m, median = M,  sd = s, z_score = z)
}

tic("Total processing time")

# Parallel processing
results_list <- mclapply(TEs, process_TE, 
                         mc.cores = detectCores() - 1,
                         mc.preschedule = FALSE,
                         mc.set.seed = FALSE)


# Fast combination with data.table
results_df <- as.data.frame(rbindlist(results_list))

total_time <- toc()
## Total processing time: 1149.751 sec elapsed
cat("\n========================================\n")
## 
## ========================================
cat("Processing complete!\n")
## Processing complete!
cat("========================================\n")
## ========================================
cat("Total TEs processed:", length(TEs), "\n")
## Total TEs processed: 10135
cat("Valid results:", nrow(results_df), "\n")
## Valid results: 737
cat("Failed/skipped TEs:", length(TEs) - nrow(results_df), "\n")
## Failed/skipped TEs: 9398
cat("Total time:", round(total_time$toc - total_time$tic, 2), "seconds\n")
## Total time: 1149.75 seconds
cat("Time per TE:", round((total_time$toc - total_time$tic) / length(TEs), 4), "seconds\n")
## Time per TE: 0.1134 seconds
cat("========================================\n\n")
## ========================================
summary_stats <- results_df %>%
  mutate(TYPEdetail = if_else(grepl("FBgn", TE), "GENE", "TE")) %>%
  group_by(TYPEdetail) %>%
  summarise(
    mean_z = mean(z_score),
    median_z = median(z_score),
    .groups = "drop"
  )

cat("Summary of z-scores:\n")
## Summary of z-scores:
print(summary_stats)
## # A tibble: 2 × 3
##   TYPEdetail mean_z median_z
##   <chr>       <dbl>    <dbl>
## 1 GENE        0.619   -0.102
## 2 TE          6.90     7.33
cat("\n")
write_tsv(results_df, "GL_pingpong_results_5pplus_5pminus.txt")
p = results_df %>%
  filter( responder>0.01, median > 0.01, !grepl("random",TE))%>%
  mutate(
    TYPEdetail = case_when(
      grepl("FBgn",TE) ~ "GENE",
      TRUE ~ "TE"
    )
  )%>%
  ggplot(aes(x=TYPEdetail, y=z_score))+
  geom_quasirandom()+
      stat_summary(
    fun = median, 
    geom = "crossbar", 
    width = 0.5, 
    fatten = 1.5, 
    color = "red"
  ) +
  stat_summary(
  fun = median,
  geom = "text",
  aes(label = sprintf("%.2f", after_stat(y))),
  vjust = -0.5,
  size = 3,
  color = "red"
  )
  # scale_y_log10()


print(p)

ggsave("GL_pingpong.pdf", width = 6, height = 4)

results_df %>%
  filter( responder>0.01, median > 0.01)%>%
  mutate(
    TYPEdetail = case_when(
      grepl("FBgn",TE) ~ "GENE",
      TRUE ~ "TE"
    )
  )%>%
  group_by(TYPEdetail) %>%
  summarise(
    mean_z = mean(z_score),
    median_z = median(z_score),
    .groups = "drop"
  )
## # A tibble: 2 × 3
##   TYPEdetail mean_z median_z
##   <chr>       <dbl>    <dbl>
## 1 GENE        0.947    0.303
## 2 TE          7.00     7.36
transcriptQUANT = read_tsv("transcript_quantification.txt")
## Rows: 11159 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): ID
## dbl (8): average-AUBshAGO3sh_nGFP-Piwi-PiwiIP_new_sense, average-AUBshAGO3sh...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

1 phasing

library(parallel)
library(tictoc)
library(data.table)
library(dplyr)

# Parameters
WINDOW_SIZE <- 10
THRESHOLD <- -2

setDT(RAW)
TEs <- RAW[, unique(TE)]
# TEs <- RAW[, unique(TE)][1:100]

process_TE_phasing_fast <- function(currTE) {
  # Fast filtering
  DT <- RAW[TE == currTE]
  if(nrow(DT) <= 30) return(NULL)
  
  analyze_strand <- function(t3, t5, start_off, end_off, target_off) {
    len <- length(t3)
    span <- end_off - start_off + 1
    link_idx <- target_off - start_off + 1
    
    # 1. Fast Denominator: Rolling sum of 3' ends across the window
    # frollsum is extremely fast C-based rolling sum
    denom <- frollsum(t3, n = span, align = "center", fill = 0)
    
    # 2. Vectorized Offset Calculation
    # Instead of looping over positions, we loop over offsets (only 21 iterations)
    # We shift the 5' end vector relative to the 3' end vector
    
    profile <- numeric(span)
    valid_mask <- t3 > THRESHOLD & denom > 0
    
    for (i in 1:span) {
      shift <- start_off + i - 1
      
      # Shift t5 vector to align with t3 at this specific offset
      if (shift > 0) {
        t5_shifted <- c(t5[-(1:shift)], rep(0, shift))
      } else if (shift < 0) {
        t5_shifted <- c(rep(0, abs(shift)), t5[1:(len + shift)])
      } else {
        t5_shifted <- t5
      }
      
      # Calculate weighted signal for this offset across all positions at once
      profile[i] <- sum((t3[valid_mask] * t5_shifted[valid_mask]) / denom[valid_mask], na.rm = TRUE)
    }
    
    total_signal <- sum(profile)
    if (total_signal == 0) return(NULL)
    
    norm_profile <- profile / total_signal
    
    # 3. Z-score
    bg <- norm_profile[-link_idx]
    m <- mean(bg)
    M <- median(bg)
    s <- sd(bg)
    
    if (is.na(s) || s == 0) return(NULL)
    
    list(z = (norm_profile[link_idx] - m) / s, res = norm_profile[link_idx], m = m, M = m, s = s)
  }
  
  # Sense: -9 to +10, target +1
  p <- analyze_strand(DT[["3END_plus"]], DT[["5END_plus"]], -9, 10, 1)
  # Antisense: -10 to +9, target -1
  m <- analyze_strand(DT[["3END_minus"]], DT[["5END_minus"]], -10, 9, -1)
  
  if (is.null(p) && is.null(m)) return(NULL)
  
  data.frame(
    TE = currTE,
    z_sense = if(!is.null(p)) p$z else NA,
    res_sense = if(!is.null(p)) p$res else NA,
    mean_sense = if(!is.null(p)) p$m else NA,
    median_sense = if(!is.null(p)) p$M else NA,
    sd_sense = if(!is.null(p)) p$s else NA,
    z_antisense = if(!is.null(m)) m$z else NA,
    res_antisense = if(!is.null(m)) m$res else NA,
    mean_antisense = if(!is.null(m)) m$m else NA,
    median_antisense = if(!is.null(m)) m$M else NA,
    sd_antisense = if(!is.null(m)) m$s else NA,
    stringsAsFactors = FALSE
  )
}

tic("Optimized Processing")
# Use mc.preschedule = TRUE for many small TEs to reduce overhead
results_phasing <- mclapply(TEs, process_TE_phasing_fast, 
                    mc.cores = detectCores() - 1, 
                    mc.preschedule = FALSE)
toc()
## Optimized Processing: 1116.785 sec elapsed
# Combine results
 results_df_phasing <- rbindlist(results_phasing) %>%
   filter(!is.na(median_sense) | !is.na(median_antisense)) 

cat("\n========================================\n")
## 
## ========================================
cat("Processing complete!\n")
## Processing complete!
cat("========================================\n")
## ========================================
cat("Total TEs processed:", length(TEs), "\n")
## Total TEs processed: 10135
cat("Valid results:", nrow(results_df_phasing), "\n")
## Valid results: 6802
cat("Total time:", round(total_time$toc - total_time$tic, 2), "seconds\n")
## Total time: 1149.75 seconds
cat("========================================\n\n")
## ========================================
if(nrow(results_df_phasing) > 0) {
  summary_stats <- results_df_phasing %>%
    mutate(TYPEdetail = if_else(grepl("FBgn", TE), "GENE", "TE")) %>%
    group_by(TYPEdetail) %>%
    summarise(
      mean_z_sense = mean(z_sense, na.rm = TRUE),
      median_z_sense = median(z_sense, na.rm = TRUE),
      mean_z_antisense = mean(z_antisense, na.rm = TRUE),
      median_z_antisense = median(z_antisense, na.rm = TRUE),
      .groups = "drop"
    )
  
  cat("Summary of z-scores:\n")
  print(summary_stats)
  
  write_tsv(results_df_phasing, "GL_phasing_results.txt")
}
## Summary of z-scores:
## # A tibble: 2 × 5
##   TYPEdetail mean_z_sense median_z_sense mean_z_antisense median_z_antisense
##   <chr>             <dbl>          <dbl>            <dbl>              <dbl>
## 1 GENE               1.24         -0.177            0.841             -0.229
## 2 TE                 5.81          5.73             6.52               6.58
p = results_df_phasing %>%
  mutate(
    TYPEdetail = case_when(
      grepl("FBgn",TE) ~ "GENE",
      TRUE ~ "TE"
    )
  )%>%
  filter(res_sense>0.1, median_sense>0.01)%>%
  ggplot(aes(x=TYPEdetail, y=z_sense))+
  geom_quasirandom()+
    stat_summary(
    fun = median, 
    geom = "crossbar", 
    width = 0.5, 
    fatten = 1.5, 
    color = "red"
  ) +
  stat_summary(
  fun = median,
  geom = "text",
  aes(label = sprintf("%.2f", after_stat(y))),
  vjust = -0.5,
  size = 3,
  color = "red"
  )
  
print(p)

ggsave("GL_phasing.pdf", width = 6, height = 4)

results_df_phasing %>%
  mutate(
    TYPEdetail = case_when(
      grepl("FBgn",TE) ~ "GENE",
      TRUE ~ "TE"
    )
  )%>%
  filter(res_sense>0.01, median_sense>0.01)%>%
  group_by(TYPEdetail) %>%
  summarise(
    mean_z = mean(z_sense),
    median_z = median(z_sense),
    .groups = "drop"
  )
## # A tibble: 2 × 3
##   TYPEdetail mean_z median_z
##   <chr>       <dbl>    <dbl>
## 1 GENE         2.05     1.48
## 2 TE           5.90     5.87
print(p)