forked from MariaNattestad/assemblytics
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathAssemblytics_dotplot.R
More file actions
executable file
·129 lines (76 loc) · 5.16 KB
/
Copy pathAssemblytics_dotplot.R
File metadata and controls
executable file
·129 lines (76 loc) · 5.16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
library(ggplot2)
args<-commandArgs(TRUE)
prefix <- args[1]
filename <- paste(prefix,".oriented_coords.csv",sep="")
plot.output.filename <- paste(prefix,".Assemblytics.Dotplot_filtered",sep="")
plot.title <- "Dot plot of Assemblytics filtered alignments"
ref.pos <- function(chrom,pos,chr.lengths) {
chrom.index <- which(names(chr.lengths)==chrom)-1
offset.based.on.previous.chromosomes <- 0
if (chrom.index != 0) {
offset.based.on.previous.chromosomes <- sum(as.numeric(chr.lengths[c(1:chrom.index)]))
}
loc <- offset.based.on.previous.chromosomes + pos
loc
}
coords <- read.csv(filename,sep=",",header=TRUE)
if (nrow(coords)>100000) {
coords <- coords[1:100000,]
}
# names(coords) <- c("ref_start", "ref_end","query_start","query_end","ref_length","query_length","ref","query")
coords$ref <- as.character(coords$ref)
coords$query <- as.character(coords$query)
ordered_common_chromosome_names <- c(seq(1,100),paste("chr",seq(1,100),sep=""),paste("Chr",seq(1,100),sep=""),c("X","Y","M","MT","Chr0","chr0","0"))
all_chromosomes_some_ordered <- c(intersect(ordered_common_chromosome_names,unique(coords$ref)),setdiff(unique(coords$ref),ordered_common_chromosome_names))
coords$ref <- factor(coords$ref,levels=all_chromosomes_some_ordered)
chromosomes <- levels(coords$ref)
chr.lengths <- sapply(chromosomes,function(chr){max(coords[coords$ref==chr,"ref_length"])})
names(chr.lengths) <- chromosomes
coords <- cbind(coords, alignment.length=abs(coords$query_start-coords$query_end))
coords <- cbind(coords, ref.loc.start=mapply(FUN=ref.pos,coords$ref,coords$ref_start,MoreArgs=list(chr.lengths)),
ref.loc.stop=mapply(FUN=ref.pos,coords$ref,coords$ref_end,MoreArgs=list(chr.lengths)))
# pick longest alignment. then pick the ref.loc.start of that
query.group <- split(coords,factor(coords$query))
ref.loc.of.longest.alignment.by.query <- unlist(sapply(query.group, function(coords.for.each.query) {coords.for.each.query$ref.loc.start[coords.for.each.query$alignment.length==max(coords.for.each.query$alignment.length)][1]}),recursive=FALSE)
# decide optimal-ish ordering of the queries
ordered.query.names <- names(ref.loc.of.longest.alignment.by.query)[order(ref.loc.of.longest.alignment.by.query)]
# construct a query.lengths list
query.lengths <- sapply(ordered.query.names,function(each.query){
max(coords[coords$query==each.query,"query_length"])
})
# use the query.lengths to give offset positions to each query, adding a query.loc.start column and a query.loc.stop column to each entry in filtered.coords
coords$query.loc.start <- mapply(FUN=ref.pos,coords$query,coords$query_start,MoreArgs=list(query.lengths))
coords$query.loc.stop <- mapply(FUN=ref.pos,coords$query,coords$query_end,MoreArgs=list(query.lengths))
# Hide labels for chromosomes accounting for less than 2% of the reference
chr.labels <- names(chr.lengths)
chr.labels[chr.lengths < 0.02*sum(as.numeric(chr.lengths))] <- ""
query.labels <- names(query.lengths)
query.labels[query.lengths < 0.02*sum(as.numeric(query.lengths))] <- ""
theme_set(theme_bw(base_size = 24))
colors <- c("black","red")
coords$tag <- factor(coords$tag,levels=c("unique","repetitive"))
# CREATE PNG
png(file=paste(plot.output.filename, ".png",sep=""),width=1000,height=1000)
print(ggplot(coords, aes(x=ref.loc.start,xend=ref.loc.stop,y=query.loc.start,yend=query.loc.stop,color=tag)) + geom_segment(lineend="butt",size=1.5) + labs(x="Reference",y="Query",title=plot.title) + scale_y_continuous(breaks = cumsum(as.numeric(query.lengths)),labels=query.labels,expand=c(0,0), limits = c(0,sum(as.numeric(query.lengths)))) + scale_x_continuous(breaks = cumsum(as.numeric(chr.lengths)),labels=chr.labels,expand=c(0,0),limits=c(0,sum(as.numeric(chr.lengths)))) + scale_color_manual(values=colors,name="Filter") + theme(
axis.ticks.y=element_line(size=0),
axis.text.x = element_text(angle = 90, hjust = 1,vjust=-0.5),
axis.text.y = element_text(size=12,vjust=1.1),
plot.title = element_text(vjust=3),
panel.grid.major.x = element_line(colour = "black",size=0.1),
panel.grid.major.y = element_line(colour = "black",size=0.1),
panel.grid.minor = element_line(NA)
))
dev.off()
# CREATE PDF
# pdf(file=paste(plot.output.filename, ".pdf",sep=""),width=100,height=100,res=100)
# print(ggplot(coords, aes(x=ref.loc.start,xend=ref.loc.stop,y=query.loc.start,yend=query.loc.stop)) + geom_segment(lineend="butt",size=1.5) + labs(x="Reference",y="Query",title=plot.title) + scale_y_continuous(breaks = cumsum(as.numeric(query.lengths)),labels=query.labels,expand=c(0,0), limits = c(0,sum(as.numeric(query.lengths)))) + scale_color_manual(values=colors) + scale_x_continuous(breaks = cumsum(as.numeric(chr.lengths)),labels=chr.labels,expand=c(0,0),limits=c(0,sum(as.numeric(chr.lengths)))) + theme(
# axis.ticks.y=element_line(size=0),
# axis.text.x = element_text(angle = 90, hjust = 1,vjust=-0.5),
# axis.text.y = element_text(size=12,vjust=1.1),
# plot.title = element_text(vjust=3),
# panel.grid.major.x = element_line(colour = "black",size=0.1),
# panel.grid.major.y = element_line(colour = "black",size=0.1),
# panel.grid.minor = element_line(NA)
# ))
#
# dev.off()