Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
384 changes: 384 additions & 0 deletions crosscorrelation.Rmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,384 @@
---
title: "uploadid"
author: "Amenze"
date: "June 28, 2018"
output: html_document
---

```{r}
setwd("/Users/Amenze/Desktop/tidepool/refdata")

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Remove this since there's no guarantee anyone else will have the same directory structure.

```


```{r }
#library(data.table)
library(ggplot2)
library(plyr)
library(dplyr)
#using jmotif
library(jmotif)
#install.packages("RecordLinkage")
library(RecordLinkage)
#install.packages("PTXQC")
library("PTXQC")
library(stringr)
library(zoo)

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add a similar block to this that includes the install.packages(...) for these libraries? It would help someone if they didn't have them installed to just run the block




```




```{r}

# create a list from these files
list.filenames<-list.files("/Users/Amenze/Desktop/tidepool/refdata",pattern=".csv$")

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's make this a choose dialog since this directory path isn't going to work on anyone else's computer.



#extract files based on duplicated utctime
for (i in 1:length(list.filenames))
{
patient<-read.csv(list.filenames[i])
patient_cbg<-subset(patient,patient$type=="cbg")
if ((length(unique(patient_cbg$utcTime)))!=nrow(patient_cbg))
write.csv(patient_cbg,paste0("/Users/Amenze/Desktop/tidepool/refdata/duplicated",list.filenames[i]))

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hardcoded path....


}
```

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Remove unnecessary blank lines





```{r}
##read in file and check for duplicated utc

field<-c("deviceId","id","uploadId","utcTime","type","value")
patient<-read.csv("duplicated0289cfb8bd6d61ccf1f31c07aa146b7b14f0eb74474be4311860d9d77dd30f15.csv")[,field]

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hardcoded path...

patient$mgvalue<-patient$value*18.01559
length(unique(patient$uploadId)) #1319
length(unique(patient$id))==nrow(patient) ##True
length(unique(patient$utcTime))#72287
length(unique(patient$utcTime))==nrow(patient) #False



###subset patient file based on 5 uploadids
df1<-subset(patient,patient$uploadId=="upid_3c41703c2d3a8b97f479afdb6ccf799f")

df2<-subset(patient,patient$uploadId=="upid_3fc32e5ad912a8ea7efced9151804bdb")


df3<-subset(patient,patient$uploadId=="upid_17db2d2a0ae0e02a12c0a5067e5fe85b")


df4<-subset(patient,patient$uploadId=="upid_5fad608cf32bd03a1cd56e3bb1fdb834")



df5<-subset(patient,patient$uploadId=="upid_830c6de3e2ecbbec6fbad0cecc64bdf5")

#plot values for each uploadid
par(mfrow=c(3,3))


plot.ts(df1$value)
plot.ts(df2$value)
plot.ts(df3$value)
plot.ts(df4$value)
plot.ts(df5$value)

```



```{r}
## input
#x:vector value for uploadid x
#y: vector value for uploadid y
#outputs
#longeststring:longest common substring match between discretize value of x and y respectively p1 & p2
#percentage similarity
#alphaxy:the alphabet size

stringcnvert<-function(x,y,alphaxy){
if (length(x)!=length(y)){
normvalue=x
}
else if(length(x)==length(y)){
normvalue=x
}
normvalue.mean <- mean(normvalue)
normvalue.dev<-sd(normvalue)
xznorm<-(x - normvalue.mean)/normvalue.dev
yznorm<-(y - normvalue.mean)/normvalue.dev
y_paa1 = paa(xznorm,length(x)) ##we decided to use the exact length of the vector based on what we want to achieve
y_paa2 = paa(yznorm,length(y))
xstringvalue<-series_to_string(y_paa1, alphaxy)
ystringvalue<-series_to_string(y_paa2, alphaxy)
p1<-xstringvalue
p2<-ystringvalue
longeststring<-LCSn(c(p1,p2))
return (list(longeststring,p1,p2,(levenshteinSim(xstringvalue,ystringvalue))))


}




##input
#values:output from stringcnvert function
##ouptputs
#stringvalue: 2 strings compared
#substringrep:matching substring that was compared,
#stringdetect:boolean eqivalent of string detected(TRUE:if detected, FALSE:not detected)
#stringcount:count of substrng in stringvalue
#loc:starting and ending indexes for substring in the two strings compared.

duplicateindex<-function(values){
stringsvalues <- as.character(c(values[2],values[3]))
substringrep <- as.character(values[1])
stringdetect<-str_detect(stringsvalues,substringrep)
stringcount<-str_count(stringsvalues, substringrep)
loc <- str_locate(stringsvalues,substringrep )
return(list(stringsvalues,substringrep,stringdetect,stringcount,loc))
}



##input
#x & y:dataframe
#strngindex: output from duplicateindex function(loc)
##ouptputs
#s1:numeric vector ,
#s2:numeric vector,
#index1:indexes of s1,
#index2:indexes of s2
#
extractindex<-function(x,y,strngindex){

s1<-x[strngindex[5][[1]][[1]]:strngindex[5][[1]][[3]],"value"]
s2<-y[strngindex[5][[1]][[2]]:strngindex[5][[1]][[4]],"value"]
indexs1<-row.names(x[strngindex[5][[1]][[1]]:strngindex[5][[1]][[3]],])
indexs2<-row.names(y[strngindex[5][[1]][[2]]:strngindex[5][[1]][[4]],])

return(list(s1,s2,indexs1,indexs2))

}


###input
#x:a list of indexes
##ouptputs
#indexnum1:numeric equivalent of indexes

extractvalue<-function(x){
for (i in 1:length(x)[[1]])
indexnum <- c(x[[i]])
indexnum1<-as.numeric(indexnum)
return(indexnum1)
}

##main
#
system.time(values<-stringcnvert(df1$mgvalue,df2$mgvalue,3))#http://www.cs.ucr.edu/~eamonn/SAX.pdf alphabet size: 5-8 works better.
#user system elapsed
# 0.11 0.00 0.11

system.time(strngindex<-duplicateindex(values))
validate<-extractindex(df1,df2,strngindex)
seq1<-extractvalue(validate[3])
seq2<-extractvalue(validate[4])

df1_mgvalue<-patient[seq1,"mgvalue"]
df2_mgvalue<-patient[seq2,"mgvalue"]
compare<-data.frame(df1_mgvalue,df2_mgvalue)

# par(mfrow=c(1,2))
# plot.ts(compare$df1_mgvalue)
# plot.ts(compare$df2_mgvalue)

plot.zoo(cbind(compare$df1_mgvalue,compare$df2_mgvalue),
plot.type = "multiple",
col = c("red", "blue"))


sum(compare$df1_mgvalue-compare$df2_mgvalue)


```



```{r}
values<-stringcnvert(df3$mgvalue,df4$mgvalue,3 ) #http://www.cs.ucr.edu/~eamonn/SAX.pdf alphabet size: 5-8 works better. ##3 worked better for this run
strngindex<-duplicateindex(values)
validate<-extractindex(df3,df4,strngindex)
seq1<-extractvalue(validate[3])
seq2<-extractvalue(validate[4])

df1_mgvalue<-patient[seq1,"mgvalue"]
df2_mgvalue<-patient[seq2,"mgvalue"]
compare<-data.frame(df1_mgvalue,df2_mgvalue)

# par(mfrow=c(1,2))
# plot.ts(compare$df1_mgvalue)
# plot.ts(compare$df2_mgvalue)

plot.zoo(cbind(compare$df1_mgvalue,compare$df2_mgvalue),
plot.type = "multiple",
col = c("red", "blue"))


sum(compare$df1_mgvalue-compare$df2_mgvalue)
```




```{r}
patient<-read.csv("duplicated0fe539475b52ae23f939d7dd2596cf8eb1e877edcea0478f2df73bb98bd5937c.csv")[,field]
patient$mgvalue<-patient$value*18.01559
length(unique(patient$uploadId)) #2
length(unique(patient$id))==nrow(patient) ##True
length(unique(patient$utcTime))# 8899
anyDuplicated(patient$utcTime) #4366
length(unique(patient$utcTime))==nrow(patient) #False



df1<-subset(patient,patient$uploadId=="2f61322480c841fd8679fe81e94930b2")


df2<-subset(patient,patient$uploadId=="c05970591b404518a1cbd64595d628e5")



```


```{r}
values<-stringcnvert(df1$mgvalue,df2$mgvalue,8)
strngindex<-duplicateindex(values)
validate<-extractindex(df1,df2,strngindex)
seq1<-extractvalue(validate[3])
seq2<-extractvalue(validate[4])

df1_mgvalue<-patient[seq1,"mgvalue"]
df2_mgvalue<-patient[seq2,"mgvalue"]
compare<-data.frame(df1_mgvalue,df2_mgvalue)



plot.zoo(cbind(compare$df1_mgvalue,compare$df2_mgvalue),
plot.type = "multiple",
col = c("red", "blue"))

sum(compare$df1_mgvalue-compare$df2_mgvalue)


```


Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Blank lines...














#cross correlation
```{r}
corre<-function(x,y,lagsize){
corr<-ccf(x$mgvalue,y$mgvalue,max.lag=lagsize,plot=TRUE)
max.value<-max(corr$acf)
max.lag<-corr$lag[which(corr$acf==max.value)]
xlength<-length(x$mgvalue)
ylength<-length(y$mgvalue)
if ((max.lag==0) & (xlength<ylength)) {
computelength<-xlength
xvalue<-x
yvalue<-y[max.lag:computelength,]
}
else if ((max.lag==0)& (ylength<xlength)){
computelength=ylength
xvalue=x[1:computelength,]
yvalue=y
}
else if((max.lag==0) & (xlength==ylength)){
xvalue=x
yvalue=y
}

if ((max.lag>0)& (xlength<ylength)){
startcomputelength=max.lag
endcomputelength<-xlength-max.lag
xvalue=x[max.lag:endcomputelength,]
yvalue=y[max.lag:endcomputelength,]
}
if ((max.lag>0)&(ylength<xlength)){
startcomputelength<-max.lag
endcomputelength<-ylength-max.lag
xvalue<-x[max.lag:endcomputelength,]
yvalue<-y[max.lag:endcomputelength,]
}
else if((max.lag>0) & (xlength==ylength)){
startcomputelength<-max.lag
xvalue=x[max.lag:length(x),]
yvalue=y[max.lag:length(y),]
}
if (max.lag<0){
startcomputelength<-abs(max.lag)
xendcomputelength<-xlength-startcomputelength
yendcomputelength<-ylength-startcomputelength
xvalue<-x[xendcomputelength:1,]
yvalue<-y[ylength:(startcomputelength+1),]
}

return (list(df1=xvalue,df2=yvalue,max_correlation=max.value))

}

system.time(corr<-corre(df1,df2,10))
df1adj<-corr$df1
df2adj<-corr$df2

#ccf(df1$mgvalue,df2$mgvalue)


##extracte the index

t1indexvalue<-row.names(df1adj)
t2indexvalue<-row.names(df2adj)
ts1<-patient[t1indexvalue,"mgvalue"]
ts2<-patient[t2indexvalue,"mgvalue"]

plot.zoo(cbind(ts1,ts2),
plot.type = "multiple",
col = c("red", "blue"))
sum(ts1-ts2)


```


```{r}
corr<-corre(df3,df4,10)
df1adj<-corr$df1
df2adj<-corr$df2



```



Loading