diff --git a/crosscorrelation.Rmd b/crosscorrelation.Rmd
new file mode 100644
index 00000000..c5d296a7
--- /dev/null
+++ b/crosscorrelation.Rmd
@@ -0,0 +1,384 @@
+---
+title: "uploadid"
+author: "Amenze"
+date: "June 28, 2018"
+output: html_document
+---
+
+```{r}
+setwd("/Users/Amenze/Desktop/tidepool/refdata")  # the relative file names read in later chunks resolve against this folder
+```
+
+
+```{r }
+#library(data.table)
+library(ggplot2)
+library(plyr)
+library(dplyr)
+#using jmotif
+library(jmotif)
+#install.packages("RecordLinkage")
+library(RecordLinkage)
+#install.packages("PTXQC")
+library("PTXQC")
+library(stringr)
+library(zoo)
+
+
+
+```
+
+
+
+
+```{r}
+
+# List the CSV exports; the dot is escaped so only a literal ".csv" suffix matches.
+list.filenames<-list.files("/Users/Amenze/Desktop/tidepool/refdata",pattern="\\.csv$")
+
+# For each export whose cbg rows repeat a utcTime, save those cbg rows as
+# "duplicated<name>". Files are read relative to the working directory.
+# seq_along() skips the loop when no file matched (1:0 would run twice).
+for (i in seq_along(list.filenames))
+{
+  patient<-read.csv(list.filenames[i])
+  patient_cbg<-subset(patient,patient$type=="cbg")
+  if ((length(unique(patient_cbg$utcTime)))!=nrow(patient_cbg))
+      write.csv(patient_cbg,paste0("/Users/Amenze/Desktop/tidepool/refdata/duplicated",list.filenames[i]))
+ }
+```
+
+
+
+
+
+```{r}
+##read in file and check for duplicated utc
+# Only the columns named in field are kept; the file name is relative to the working directory.
+field<-c("deviceId","id","uploadId","utcTime","type","value")
+patient<-read.csv("duplicated0289cfb8bd6d61ccf1f31c07aa146b7b14f0eb74474be4311860d9d77dd30f15.csv")[,field]
+patient$mgvalue<-patient$value*18.01559  # presumably converts mmol/L to mg/dL - TODO confirm the unit of value
+length(unique(patient$uploadId)) #1319
+length(unique(patient$id))==nrow(patient) ##True
+length(unique(patient$utcTime))#72287
+length(unique(patient$utcTime))==nrow(patient) #False
+
+# The trailing comments above record the results the author noted for this file.
+
+###subset patient file based on  5 uploadids
+df1<-subset(patient,patient$uploadId=="upid_3c41703c2d3a8b97f479afdb6ccf799f")
+
+df2<-subset(patient,patient$uploadId=="upid_3fc32e5ad912a8ea7efced9151804bdb")
+
+
+df3<-subset(patient,patient$uploadId=="upid_17db2d2a0ae0e02a12c0a5067e5fe85b")
+
+
+df4<-subset(patient,patient$uploadId=="upid_5fad608cf32bd03a1cd56e3bb1fdb834")
+
+
+
+df5<-subset(patient,patient$uploadId=="upid_830c6de3e2ecbbec6fbad0cecc64bdf5")
+
+#plot values for each uploadid
+par(mfrow=c(3,3))
+# 3 x 3 plotting grid; five of the nine panels are filled below
+
+plot.ts(df1$value)
+plot.ts(df2$value)
+plot.ts(df3$value)
+plot.ts(df4$value)
+plot.ts(df5$value)
+
+```
+
+
+
+```{r}
+## Discretise two series with SAX and compare the resulting strings.
+## input
+#x:numeric vector of values for uploadid x (also supplies the mean/sd for scaling)
+#y:numeric vector of values for uploadid y
+#alphaxy:the SAX alphabet size
+##output: unnamed list
+#[[1]] longest common substring of the two strings (LCSn)
+#[[2]] string for x, [[3]] string for y
+#[[4]] levenshteinSim() similarity of the two strings
+stringcnvert<-function(x,y,alphaxy){
+  # Both vectors are scaled with the statistics of x so that equal readings
+  # get equal symbols; the former length test chose x in both branches.
+  normvalue.mean <- mean(x)
+  normvalue.dev<-sd(x)
+  # A constant, single-value or NA-containing x gives sd 0/NA and would turn
+  # every scaled value into NaN/NA, so fail early with a clear message.
+  if (is.na(normvalue.dev) || normvalue.dev==0){
+    stop("x needs at least two distinct, non-missing values", call.=FALSE)
+  }
+  xznorm<-(x - normvalue.mean)/normvalue.dev
+  yznorm<-(y - normvalue.mean)/normvalue.dev
+  # PAA size equals the vector length, so no points are averaged away.
+  xstringvalue<-series_to_string(paa(xznorm,length(x)), alphaxy)
+  ystringvalue<-series_to_string(paa(yznorm,length(y)), alphaxy)
+  longeststring<-LCSn(c(xstringvalue,ystringvalue))
+  return (list(longeststring,xstringvalue,ystringvalue,
+               levenshteinSim(xstringvalue,ystringvalue)))
+}
+
+
+
+
+
+
+## Locate the longest common substring inside both SAX strings.
+##input
+#values:list returned by stringcnvert (substring, string 1, string 2, similarity)
+##outputs (unnamed list, in this order)
+#the two strings that were searched; the substring used as the stringr pattern
+#TRUE/FALSE per string: substring detected
+#number of occurrences of the substring per string
+#matrix with start and end position of the first occurrence per string
+
+duplicateindex<-function(values){
+  sax_pair <- as.character(c(values[2],values[3]))
+  common <- as.character(values[1])
+  list(sax_pair,
+       common,
+       str_detect(sax_pair,common),
+       str_count(sax_pair, common),
+       str_locate(sax_pair,common ))
+}
+
+
+
+## Pull the matched stretch out of both data frames.
+##input
+#x & y:data frames with a "value" column
+#strngindex:output of duplicateindex; element 5 is the start/end matrix
+##outputs (unnamed list)
+#values of x in the matched range, values of y in the matched range,
+#row names of x in that range, row names of y in that range
+extractindex<-function(x,y,strngindex){
+  loc <- strngindex[[5]]
+  # row 1 of loc belongs to x, row 2 to y; column 1 = start, column 2 = end
+  rows_x <- loc[1,1]:loc[1,2]
+  rows_y <- loc[2,1]:loc[2,2]
+  matched_x <- x[rows_x,"value"]
+  matched_y <- y[rows_y,"value"]
+  names_x <- row.names(x[rows_x,])
+  names_y <- row.names(y[rows_y,])
+  list(matched_x,matched_y,names_x,names_y)
+}
+
+
+
+###input
+#x:a list of index vectors (e.g. validate[3]: a list holding row names)
+##outputs
+#all indexes of all list elements as one numeric vector
+extractvalue<-function(x){
+  # The former brace-less for loop kept only the last list element and
+  # failed on an empty list; unlist() covers every element and length 0.
+  indexnum1<-as.numeric(unlist(x))
+  return(indexnum1)
+}
+
+
+##main
+# Compare uploads df1 and df2 with a SAX alphabet of 3 symbols.
+system.time(values<-stringcnvert(df1$mgvalue,df2$mgvalue,3))#http://www.cs.ucr.edu/~eamonn/SAX.pdf alphabet size: 5-8 works better.
+#user  system elapsed 
+#   0.11    0.00    0.11 
+# Locate the common substring, then map it back to rows of df1/df2.
+system.time(strngindex<-duplicateindex(values))
+validate<-extractindex(df1,df2,strngindex)
+seq1<-extractvalue(validate[3])
+seq2<-extractvalue(validate[4])
+# seq1/seq2 are row names turned into numbers and used as row positions of patient.
+df1_mgvalue<-patient[seq1,"mgvalue"]
+df2_mgvalue<-patient[seq2,"mgvalue"]
+compare<-data.frame(df1_mgvalue,df2_mgvalue)
+
+# par(mfrow=c(1,2))
+# plot.ts(compare$df1_mgvalue)
+# plot.ts(compare$df2_mgvalue)
+
+plot.zoo(cbind(compare$df1_mgvalue,compare$df2_mgvalue), 
+         plot.type = "multiple", 
+         col = c("red", "blue"))
+
+# Signed differences can cancel out, so a sum of 0 alone does not prove the stretches are identical.
+sum(compare$df1_mgvalue-compare$df2_mgvalue)
+
+
+```
+
+
+
+```{r}
+values<-stringcnvert(df3$mgvalue,df4$mgvalue,3 ) #http://www.cs.ucr.edu/~eamonn/SAX.pdf alphabet size: 5-8 works better. ##3 worked better for this run
+strngindex<-duplicateindex(values)
+validate<-extractindex(df3,df4,strngindex)
+seq1<-extractvalue(validate[3])
+seq2<-extractvalue(validate[4])
+# The names below still say df1/df2, but they hold the matched stretches of df3 and df4.
+df1_mgvalue<-patient[seq1,"mgvalue"]
+df2_mgvalue<-patient[seq2,"mgvalue"]
+compare<-data.frame(df1_mgvalue,df2_mgvalue)
+
+# par(mfrow=c(1,2))
+# plot.ts(compare$df1_mgvalue)
+# plot.ts(compare$df2_mgvalue)
+
+plot.zoo(cbind(compare$df1_mgvalue,compare$df2_mgvalue), 
+         plot.type = "multiple", 
+         col = c("red", "blue"))
+
+# Signed differences can cancel out, so a sum of 0 alone does not prove the stretches are identical.
+sum(compare$df1_mgvalue-compare$df2_mgvalue)
+```
+
+
+
+
+```{r}
+patient<-read.csv("duplicated0fe539475b52ae23f939d7dd2596cf8eb1e877edcea0478f2df73bb98bd5937c.csv")[,field]
+patient$mgvalue<-patient$value*18.01559
+length(unique(patient$uploadId)) #2
+length(unique(patient$id))==nrow(patient) ##True
+length(unique(patient$utcTime))# 8899
+anyDuplicated(patient$utcTime) #4366
+length(unique(patient$utcTime))==nrow(patient) #False
+# Second test file; patient, df1 and df2 from the earlier chunks are overwritten here.
+# anyDuplicated() gives the position of the first repeated utcTime (0 if none).
+# The trailing comments above record the results the author noted for this file.
+df1<-subset(patient,patient$uploadId=="2f61322480c841fd8679fe81e94930b2")
+
+
+df2<-subset(patient,patient$uploadId=="c05970591b404518a1cbd64595d628e5")
+  
+
+
+```
+
+
+```{r}
+values<-stringcnvert(df1$mgvalue,df2$mgvalue,8)
+strngindex<-duplicateindex(values)
+validate<-extractindex(df1,df2,strngindex)
+seq1<-extractvalue(validate[3])
+seq2<-extractvalue(validate[4])
+# This run uses a SAX alphabet of 8 symbols (the runs above use 3).
+df1_mgvalue<-patient[seq1,"mgvalue"]
+df2_mgvalue<-patient[seq2,"mgvalue"]
+compare<-data.frame(df1_mgvalue,df2_mgvalue)
+
+
+
+plot.zoo(cbind(compare$df1_mgvalue,compare$df2_mgvalue), 
+         plot.type = "multiple", 
+         col = c("red", "blue"))
+# Signed differences can cancel out, so a sum of 0 alone does not prove the stretches are identical.
+sum(compare$df1_mgvalue-compare$df2_mgvalue)
+
+
+```
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# Cross correlation
+```{r}
+corre<-function(x,y,lagsize){
+  # Align two upload data frames at the lag with the highest
+  # cross-correlation of their mgvalue columns.
+  # Args:
+  #   x, y:    data frames with a numeric mgvalue column
+  #   lagsize: largest lag (in samples) examined in either direction
+  # Returns:
+  #   list(df1 = rows of x, df2 = rows of y, max_correlation = largest
+  #   cross-correlation). df1 and df2 have the same number of rows, keep
+  #   their original row names and order, and row i of df1 is the sample
+  #   paired with row i of df2 at the chosen lag.
+  #
+  # Changes against the first version:
+  #   * ccf() calls its limit lag.max; max.lag= was not matched to it, so
+  #     lagsize never restricted the search.
+  #   * which.max() yields one position even when the maximum is tied, so
+  #     the if() conditions always receive a single value.
+  #   * row counts come from nrow(); length() of a data frame is its
+  #     number of columns.
+  #   * a positive lag now really shifts x against y (both windows used to
+  #     be cut at the same positions) and a negative lag no longer returns
+  #     the rows in reversed order.
+
+  corr<-ccf(x$mgvalue,y$mgvalue,lag.max=lagsize,plot=TRUE)
+  best<-which.max(corr$acf)
+  if (length(best)==0){
+    stop("cross-correlation is undefined for these series", call.=FALSE)
+  }
+  max.value<-corr$acf[best]
+  max.lag<-as.integer(round(corr$lag[best]))
+
+  # ccf() only uses the overlapping first n samples of both series.
+  n<-min(nrow(x),nrow(y))
+  shift<-abs(max.lag)
+  window<-seq_len(n-shift)
+
+  # Lag k pairs x[t + k] with y[t]: a positive lag skips the first k rows
+  # of x, a negative lag skips the first |k| rows of y.
+  if (max.lag>=0){
+    xvalue<-x[window+shift,]
+    yvalue<-y[window,]
+  } else {
+    xvalue<-x[window,]
+    yvalue<-y[window+shift,]
+  }
+
+  return (list(df1=xvalue,df2=yvalue,max_correlation=max.value))
+
+}
+ 
+system.time(corr<-corre(df1,df2,10))
+df1adj<-corr$df1
+df2adj<-corr$df2
+ 
+#ccf(df1$mgvalue,df2$mgvalue)
+
+# The aligned frames keep their row names, which are used to look the readings up in patient.
+##extracte the index
+
+t1indexvalue<-row.names(df1adj)
+t2indexvalue<-row.names(df2adj)
+ts1<-patient[t1indexvalue,"mgvalue"]
+ts2<-patient[t2indexvalue,"mgvalue"]
+# ts1 - ts2 below recycles the shorter vector if the two differ in length.
+plot.zoo(cbind(ts1,ts2), 
+         plot.type = "multiple", 
+         col = c("red", "blue"))
+sum(ts1-ts2)
+
+
+```
+
+
+```{r}
+corr<-corre(df3,df4,10)
+df1adj<-corr$df1
+df2adj<-corr$df2
+# Same call for the df3/df4 pair; df1adj and df2adj are not used further in this chunk.
+
+
+```
+
+
+
diff --git a/deduplication.Rmd b/deduplication.Rmd
new file mode 100644
index 00000000..159c9729
--- /dev/null
+++ b/deduplication.Rmd
@@ -0,0 +1,196 @@
+---
+title: "De-duplication"
+author: "Amenze Okpah"
+date: "June 28, 2018"
+output: html_document
+---
+
+Compare the similarity between two numeric vectors using Symbolic Aggregate approXimation (SAX).
+Applying the SAX algorithm to an input time series transforms it into a string.
+The algorithm was proposed by Lin et al. and extends the PAA-based approach, inheriting the original algorithm's simplicity and low computational complexity while providing satisfactory sensitivity and selectivity in range query processing. Moreover, the use of a symbolic representation opened a door to the existing wealth of data structures and string-manipulation algorithms in computer science such as hashing, regular expressions, pattern matching, suffix trees, and grammatical inference.
+#http://www.cs.ucr.edu/~eamonn/SAX.htm
+#https://jmotif.github.io/sax-vsm_site/morea/algorithm/PAA.html
+
+```{r}
+# List the CSV exports in a folder picked by the user (choose.dir() exists on Windows only).
+dir <- choose.dir(default = "", caption = "Select folder")
+list.filenames<-list.files(dir,pattern="\\.csv$")  # escaped dot: literal ".csv" suffix
+# Save the cbg rows of every export with a repeated utcTime as "duplicated<name>" inside dir.
+for (i in seq_along(list.filenames))  # no iteration when nothing matched
+{
+  patient<-read.csv(file.path(dir,list.filenames[i]))  # read from dir, not from getwd()
+  patient_cbg<-subset(patient,patient$type=="cbg")
+  if ((length(unique(patient_cbg$utcTime)))!=nrow(patient_cbg))
+      write.csv(patient_cbg,file.path(dir,paste0("duplicated",list.filenames[i])))
+ }
+```
+  # Converts two vectors to their string equivalents
+  # Args:
+  # x: vector of numeric values for uploadid1 
+  # y: vector of numeric values for uploadid2 
+  # alpha.Size: alphabet size (number of characters used to discretize the numeric values)
+  # Returns:
+  #   longeststring: longest common substring of the discretized values of x and y (stringX and stringY, respectively)
+  # percentage similarity between vectors x and y
+
+#Note: since PAA takes normalized values, both vectors are normalized using the mean and standard deviation of x.
+#PAA: the length of the PAA values is fixed to the length of each vector to avoid reducing the dimensions, since all values are needed to check for duplication.
+```{r}
+StringConvert <- function(x, y, alpha.Size){
+  # Discretises x and y with SAX and compares the resulting strings.
+  # Returns an unnamed list: longest common substring (LCSn), string of x,
+  # string of y, levenshteinSim() similarity of the two strings.
+  # Both vectors are scaled with the mean/sd of x (the former length test
+  # picked x in both branches) so equal readings get equal symbols.
+  normvalue.Mean <- mean(x)
+  normvalue.Dev <- sd(x)
+  # sd of 0 or NA would turn every scaled value into NaN/NA: fail early.
+  if (is.na(normvalue.Dev) || normvalue.Dev == 0){
+    stop("x needs at least two distinct, non-missing values", call. = FALSE)
+  }
+  xnormalized <- (x - normvalue.Mean) / normvalue.Dev
+  ynormalized <- (y - normvalue.Mean) / normvalue.Dev
+  # PAA size equals the vector length, so no points are averaged away.
+  stringX <- series_to_string(paa(xnormalized, length(x)), alpha.Size)
+  stringY <- series_to_string(paa(ynormalized, length(y)), alpha.Size)
+  longeststring <- LCSn(c(stringX, stringY))
+  return (list(longeststring, stringX, stringY, levenshteinSim(stringX, stringY)))
+}
+```
+#Computes the range of characters in each string value that matches the common substring
+  Args:
+    values: output from the StringConvert function 
+  Returns:
+     loc: matching ranges
+```{r}
+IndexRange<-function(values){
+  # Start/end of the common substring (values[1]) in both strings
+  # (values[2], values[3]), wrapped in a one-element list.
+  sax.Pair <- as.character(c(values[2],values[3]))
+  list(str_locate(sax.Pair, as.character(values[1])))
+}
+```
+##Computes the duplicated values of each input and their indexes 
+  # Args:
+  # x & y: data frames with a "value" column; IndexRange: output of the IndexRange function
+  # Returns:
+  #   duplicate.X: duplicated values for x
+  # duplicate.Y: duplicated values for y
+  # index.X: row names of the duplicated values in x
+  # index.Y: row names of the duplicated values in y
+```{r}
+ExtractDuplicateIndex <- function(x, y, IndexRange){
+   loc <- IndexRange[[1]]  # row 1: start/end range in x, row 2: start/end range in y
+   rows.X <- loc[1, 1] : loc[1, 2]
+   rows.Y <- loc[2, 1] : loc[2, 2]
+   # values and row names of the matched stretch of each data frame
+   list(x[rows.X, "value"], y[rows.Y, "value"], row.names(x[rows.X, ]), row.names(y[rows.Y, ]))
+}
+```
+##Converts a list of indexes to a numeric vector 
+  Args:
+    x: list of index vectors
+  Returns:
+    duplicate.Index: duplicated value indexes
+```{r}
+ExtractVectorIndex<-function(x){
+  # The former brace-less for loop kept only the last list element and failed
+  # on an empty list; unlist() converts every element and handles length 0.
+  duplicate.Index<-as.numeric(unlist(x))
+return(duplicate.Index)
+}
+```
+#Import files and subset the data by uploadId to test the functions
+
+```{r}
+##read in file and check for duplicated utcTime
+field <- c("deviceId", "id", "uploadId", "utcTime", "type", "value")
+patient <- read.csv("FileName ")[,field] #0289cfb8bd6d61ccf1f31c07aa146b7b14f0eb74474be4311860d9d77dd30f15.csv
+patient$mgvalue <- patient$value*18.01559
+
+#check for unique uploadIds
+uniqueid <- function(df){
+   # Returns the distinct uploadIds of df. The former body looped once over
+   # the uploadId column but returned the distinct utcTime values instead.
+   Id <- unique(df$uploadId)
+   return (Id)
+}
+###subset patient file based on  5 uploadids
+df1 <- subset(patient, patient$uploadId == "upid_3c41703c2d3a8b97f479afdb6ccf799f")
+df2 <- subset(patient, patient$uploadId == "upid_3fc32e5ad912a8ea7efced9151804bdb")
+df3 <- subset(patient, patient$uploadId == "upid_17db2d2a0ae0e02a12c0a5067e5fe85b")
+df4 <- subset(patient, patient$uploadId == "upid_5fad608cf32bd03a1cd56e3bb1fdb834")
+df5 <- subset(patient, patient$uploadId == "upid_830c6de3e2ecbbec6fbad0cecc64bdf5")
+
+#plot values for each uploadid
+par(mfrow=c(3, 3))
+plot.ts(df1$value)
+plot.ts(df2$value)
+plot.ts(df3$value)
+plot.ts(df4$value)
+plot.ts(df5$value)
+
+```
+
+```{r}
+system.time(values <- StringConvert(df1$mgvalue, df2$mgvalue,3))  #http://www.cs.ucr.edu/~eamonn/SAX.pdf alphabet size: 5-8 works better.
+system.time(strngindex <- IndexRange(values))
+validate <- ExtractDuplicateIndex(df1, df2, strngindex)
+seq1 <- ExtractVectorIndex(validate[3])  # row names of the matched stretch in df1, as numbers
+seq2 <- ExtractVectorIndex(validate[4])  # row names of the matched stretch in df2, as numbers
+df1_mgvalue <- patient[seq1,"value"]  # reads the "value" column although the name says mgvalue
+df2_mgvalue <- patient[seq2,"value"]
+compare <- data.frame(df1_mgvalue,df2_mgvalue)
+# Signed differences can cancel out, so a sum of 0 alone does not prove the stretches are identical.
+#plot duplicated values
+plot.zoo(cbind(compare$df1_mgvalue, compare$df2_mgvalue), 
+         plot.type = "multiple", 
+         col = c("red", "blue"))
+sum(compare$df1_mgvalue - compare$df2_mgvalue)
+```
+
+```{r}
+values <- StringConvert(df3$mgvalue, df4$mgvalue,3 ) #http://www.cs.ucr.edu/~eamonn/SAX.pdf alphabet size: 5-8 works better. ##3 worked better for this run
+strngindex <- IndexRange(values)
+validate <- ExtractDuplicateIndex(df3,df4,strngindex)
+seq1 <- ExtractVectorIndex(validate[3])
+seq2 <- ExtractVectorIndex(validate[4])
+# The names below still say df1/df2, but they hold the matched stretches of df3 and df4.
+df1_mgvalue <- patient[seq1,"value"]
+df2_mgvalue <- patient[seq2,"value"]
+compare <- data.frame(df1_mgvalue, df2_mgvalue)
+plot.zoo(cbind(compare$df1_mgvalue, compare$df2_mgvalue), 
+         plot.type = "multiple", 
+         col = c("red", "blue"))
+sum(compare$df1_mgvalue - compare$df2_mgvalue)
+```
+
+```{r}
+patient <- read.csv("filename")[,field] #0fe539475b52ae23f939d7dd2596cf8eb1e877edcea0478f2df73bb98bd5937c.csv
+patient$mgvalue <- patient$value*18.01559
+length(unique(patient$uploadId))  # number of distinct uploads
+length(unique(patient$id))==nrow(patient)  # TRUE when every id occurs once
+length(unique(patient$utcTime))  # number of distinct timestamps
+anyDuplicated(patient$utcTime)  # position of the first repeated utcTime, 0 if none
+length(unique(patient$utcTime))==nrow(patient)  # FALSE when a utcTime repeats
+df1<-subset(patient,patient$uploadId=="2f61322480c841fd8679fe81e94930b2")
+df2<-subset(patient,patient$uploadId=="c05970591b404518a1cbd64595d628e5")
+```
+
+```{r}
+values <- StringConvert(df1$mgvalue,df2$mgvalue,3)
+strngindex <- IndexRange(values)
+validate <- ExtractDuplicateIndex(df1,df2,strngindex)
+seq1 <- ExtractVectorIndex(validate[3])
+seq2 <- ExtractVectorIndex(validate[4])
+# seq1/seq2 are row names turned into numbers and used as row positions of patient.
+df1_mgvalue<-patient[seq1,"value"]
+df2_mgvalue<-patient[seq2,"value"]
+compare<-data.frame(df1_mgvalue,df2_mgvalue)
+# Signed differences can cancel out, so a sum of 0 alone does not prove the stretches are identical.
+plot.zoo(cbind(compare$df1_mgvalue,compare$df2_mgvalue), 
+         plot.type = "multiple", 
+         col = c("red", "blue"))
+sum(compare$df1_mgvalue-compare$df2_mgvalue)
+
+```
diff --git a/deduplicationFunction.R b/deduplicationFunction.R
new file mode 100644
index 00000000..1f5a0e48
--- /dev/null
+++ b/deduplicationFunction.R
@@ -0,0 +1,72 @@
+StringConvert <- function(x, y, alpha.Size){
+  #
+  # Converts two vectors to their SAX string equivalents and compares them
+  # Args:
+  #   x: numeric vector for uploadid x (also supplies the mean/sd for scaling)
+  #   y: numeric vector for uploadid y
+  #   alpha.Size: alphabet size
+  # Returns (unnamed list):
+  #   longeststring: longest common substring of stringX and stringY (LCSn)
+  #   stringX, stringY: discretized values of x and y
+  #   similarity of stringX and stringY as returned by levenshteinSim()
+  #
+  # Both vectors are scaled with the statistics of x so that equal readings
+  # get equal symbols; the former length test chose x in both branches.
+  normvalue.Mean <- mean(x)
+  normvalue.Dev <- sd(x)
+  # sd of 0 or NA would turn every scaled value into NaN/NA: fail early.
+  if (is.na(normvalue.Dev) || normvalue.Dev == 0){
+    stop("x needs at least two distinct, non-missing values", call. = FALSE)
+  }
+  xnormalized <- (x - normvalue.Mean) / normvalue.Dev
+  ynormalized <- (y - normvalue.Mean) / normvalue.Dev
+  # PAA size equals the vector length, so no points are averaged away.
+  stringX <- series_to_string(paa(xnormalized, length(x)), alpha.Size)
+  stringY <- series_to_string(paa(ynormalized, length(y)), alpha.Size)
+  longeststring <- LCSn(c(stringX, stringY))
+  return (list(longeststring, stringX, stringY,
+               levenshteinSim(stringX, stringY)))
+}
+
+
+IndexRange<- function(values){
+  # Locates the common substring inside both SAX strings
+  # Args:
+  #   values: list returned by StringConvert
+  #           (common substring, string of x, string of y, similarity)
+  # Returns:
+  #   one-element list holding the str_locate() matrix
+  #   (row 1: string of x, row 2: string of y; columns start, end)
+  sax.Pair <- as.character(c(values[2],values[3]))
+  list(str_locate(sax.Pair, as.character(values[1])))
+}
+
+
+ExtractDuplicateIndex <- function(x, y, IndexRange){
+  ## Extracts the matched (duplicated) stretch from both data frames
+  # Args:
+  #   x & y: data frames with a "value" column
+  #   IndexRange: output of IndexRange(); row 1 of its matrix is the
+  #               start/end range for x, row 2 the range for y
+  # Returns (unnamed list):
+  #   values of x in its range, values of y in its range,
+  #   row names of x in its range, row names of y in its range
+  loc <- IndexRange[[1]]
+  rows.X <- loc[1, 1] : loc[1, 2]
+  rows.Y <- loc[2, 1] : loc[2, 2]
+  list(x[rows.X, "value"], y[rows.Y, "value"],
+       row.names(x[rows.X, ]), row.names(y[rows.Y, ]))
+}
+
+
+ExtractVectorIndex<- function(x){
+  ## Converts a list of index vectors to one numeric vector
+  # Args:
+  #   x: list of index vectors (e.g. row names as character)
+  # Returns:
+  #   duplicate.Index: numeric vector with the indexes of all list elements
+  # The former brace-less for loop kept only the last list element and
+  # failed on an empty list; unlist() covers every element and length 0.
+  duplicate.Index<-as.numeric(unlist(x))
+  return(duplicate.Index)
+}
\ No newline at end of file
diff --git a/deduplication_distance.py b/deduplication_distance.py
new file mode 100644
index 00000000..40ddffd3
--- /dev/null
+++ b/deduplication_distance.py
@@ -0,0 +1,337 @@
+import numpy as np
+import pandas as pd
+"""
+Data Deduplication of Continous blood glucose. Final output are indexes of duplicated values, duplicated values and a plot of the values
+"""
+"""
+Test data  from two files
+files:0289cfb8bd6d61ccf1f31c07aa146b7b14f0eb74474be4311860d9d77dd30f15.csv
+      0fe539475b52ae23f939d7dd2596cf8eb1e877edcea0478f2df73bb98bd5937c.csv
+"""
+#Test data1
+##data = pd.read_csv("C:/Python27/test2.csv",delimiter=',')
+##print data
+##df1=data.loc[data['alp'] =='x', ['value']]
+##df2=data.loc[data['alp'] =='y', ['value']]
+#Test data2
+#data = pd.read_csv("C:/Users/Amenze/Desktop/tidepool/refdata/data.csv",delimiter=',')
+##df1 = data.loc[data['uploadId'] =='upid_3c41703c2d3a8b97f479afdb6ccf799f', ['utcTime','value']]
+##df2 = data.loc[data['uploadId'] =='upid_3fc32e5ad912a8ea7efced9151804bdb', ['utcTime','value']]
+#Test data3
+##df1 = data.loc[data['uploadId'] =='upid_17db2d2a0ae0e02a12c0a5067e5fe85b', ['utcTime','value']]
+##df2 = data.loc[data['uploadId'] =='upid_5fad608cf32bd03a1cd56e3bb1fdb834', ['utcTime','value']]
+#Test data4
+##df1 = data.loc[data['uploadId'] =='upid_5fad608cf32bd03a1cd56e3bb1fdb834', ['utcTime','value']]
+##df2 = data.loc[data['uploadId'] =='upid_830c6de3e2ecbbec6fbad0cecc64bdf5', ['utcTime','value']]
+
+#Test data5: two uploads selected by uploadId from one CSV (hard-coded local path); only utcTime and value are kept
+data=pd.read_csv("C:/Users/Amenze/Desktop/tidepool/refdata/duplicated0fe539475b52ae23f939d7dd2596cf8eb1e877edcea0478f2df73bb98bd5937c2.csv",delimiter=',')
+df1 = data.loc[data['uploadId'] =='2f61322480c841fd8679fe81e94930b2', ['utcTime','value']]
+df2 = data.loc[data['uploadId'] =='c05970591b404518a1cbd64595d628e5', ['utcTime','value']]
+
+def Distances(x,y):
+   """
+   Compute the distance matrix for vector x and y
+   Args:
+    x:vector value for uploadid x 
+    y:vector value for uploadid y
+   Returns:
+     distances: distance matrix of x and y
+   """
+   if len(y) > len(x):
+      leny = len(y)
+      lenx = len(x)
+      xval = x
+      yval = y
+   elif len(x) > len(y):
+      leny = len(x)
+      lenx = len(y)
+      xval = y
+      yval = x
+   elif len(y) == len(x):
+      lenx = len(x)
+      leny = len(y)
+      xval = x
+      yval = y
+   distances= [[0] * lenx for i in range(leny)]
+   for i in range(leny):  
+     for j in range(lenx):   
+         distances[i][j] = ((xval[j])-(yval[i]))**2
+   return distances
+
+def DiagonalList(dis):
+   """
+   Find the diagonal with the highest count of zero  and the diagonal start index
+   Args:
+    dis: distance matrix
+   Returns:
+     diagonal: diagonal with higest count of zero
+     diagonalStartIndex:Start Index of diagonal
+   """
+   matrix=np.array(dis)
+   j=-len(dis)
+   x=len(dis[0])+1
+   highestCount=0
+   for i in range(len(dis[0])-1,j,-1): 
+       arr = matrix.diagonal(i)
+       countZero = (arr == 0).sum()
+       if countZero >= highestCount:
+          highestCount = countZero
+          diagonal=arr
+          diagonalStartIndex=abs(i)
+   return (diagonal,diagonalStartIndex)
+
+def DiagonalZero(disMatrix,ts1,ts2,startindex):
+   """
+   Compute the diagonal Index with the highest count of zero (output from DiagonalList)
+   Args:
+    disMatrix: distance matrix
+    ts1:Vector 1
+    ts2:vector 2
+    startindex: the start index for the diagonal
+   Returns:
+     dia.Index:diagonal Index
+     dia.value: diagonal value
+   """
+   diaIndex=[]
+   diaValue=[]
+   if len(ts2) > len(ts1):
+      leny = len(ts2)
+      j = len(ts1)-1
+   elif len(ts1) > len(ts2):
+      leny = len(ts1)
+      j = len(ts2)-1
+   elif len(ts1) == len(ts2):
+      lenx = len(ts1)
+      leny = len(ts2)
+      j = len(ts1)-1
+   i = startindex
+   k = 0
+   while i < leny and k <= j:
+       if disMatrix[i][k] >= 0:
+          diaIndex += [[i,k]]
+          diaValue += [disMatrix[i][k]]  
+          k = k+1
+          i = i+1  
+   return (diaIndex,diaValue)
+
+def zero_runs(diaValue): #https://stackoverflow.com/questions/24885092/finding-the-consecutive-zeros-in-a-numpy-array
+   """
+   Args:
+   diaValue: diagonal values returned from function diagonalzero
+   Returns:
+    ranges: list of consecutive zero ranges in the diagonal
+   """
+   iszero = np.concatenate(([0], np.equal(diaValue, 0).view(np.int8), [0]))
+   absdiff = np.abs(np.diff(iszero))
+   ranges = np.where(absdiff == 1)[0].reshape(-1, 2)
+   return ranges
+def CountZero(runs):
+   """
+   Args:
+    runs: list of start and stop index of the consecutive zeros in an array
+   Returns:
+    totalCount:returns the count of consecutive zero
+    countIndex:list index with max zeros
+   """
+   maxcount=0
+   for i in  range(len(runs)):
+        count=runs[i][1]-runs[i][0]
+        if count>=maxcount:
+            maxcount=count
+            countIndex=i
+   totalCount=maxcount
+   return (totalCount,countIndex)
+
+
+def ZeroIndex(runs,dia,runindex):
+    """
+   #accumulate indexes  
+    Args:
+      dia:diagonal indexes returned from function DiagonalZero
+      runs:start and stop indexes
+   Results:
+     indexlst: list of indexes
+    """
+    i=runs[0]
+    j=runs[1]
+    indexlst=[]
+    for i in range(i,j):
+       indexlst+=[dia[i]]
+
+    return indexlst
+
+def DupIndex(x,y,indexzero):
+    """
+    Args:
+    x & y : vectors
+    indexzero:Output from function zeroindex
+    Result:
+     xindex& yindex:matrix indexes for vector x and y
+     ts1dup &ts2dup: duplicated values
+    """
+    yvalue=[]
+    xvalue=[]
+    xdup=[]
+    ydup=[]
+    if len(y)>len(x):
+       yval=y
+       xval=x
+    elif len(x)>len(y):
+      xval=y
+      yval=x
+    elif len(y)==len(x):
+      xval=x
+      yval=y
+    for i in range(len(indexzero)):
+       yvalue+=[indexzero[i][0]]
+    yindex=yvalue
+    for i in range(len(indexzero)):
+       xvalue+=[indexzero[i][1]]
+    xindex=xvalue
+    for i in range(len(xindex)):
+        val=xindex[i]
+        xdup+=[xval[val]] 
+    ts1dup=xdup
+    ts1dup.reverse()
+    for i in range(len(yindex)):
+        val=yindex[i]
+        ydup+=[yval[val]]
+    ts2dup=ydup
+    ts2dup.reverse()
+    return(xindex,yindex,ts1dup,ts2dup)
+   
+def lookupdict(x):
+   """
+   Args:
+     x: vector
+   Returns:
+     indexDict: a dictionary holding the original indexes of vector
+   """
+   indexDict={}
+   for i in range(len(x)):
+       indexDict[i]=x[i]
+   return indexDict
+
+def ExtracteIndex(xDict,yDict,xIndex,yIndex):
+   """
+   Args:
+      xDict & yDict: dictionary from function lookupdict
+      xIndex $ yIndex: line up indexes
+   Returns:
+    xIndexList,yIndexList: a list of original duplicate indexes
+   """
+   xIndexList=[]
+   yIndexList=[]
+   if len(yIndex) > len(xIndex):
+      y=yIndex
+      x=xIndex
+   elif len(xIndex)>len(yIndex):
+      y=xIndex
+      x=yIndex
+     # x.Dict=x
+   elif len(xIndex)==len(yIndex):
+      y=yIndex
+      x=xIndex
+   if len(yDict)>len(xDict):
+      xdict=xDict
+      ydict=yDict
+   elif len(xDict)> len(yDict):
+       xdict=yDict
+       ydict=xDict
+   elif len(xDict)==len(yDict):
+      xdict=xDict
+      ydict=yDict
+   for i in xIndex:
+        xIndexList+=[xdict[i]]
+   for i in yIndex:
+        yIndexList+=[ydict[i]]
+   xIndexList.reverse()
+   yIndexList.reverse()
+   return (xIndexList,yIndexList)
+import time
+start_time = time.time()
+     
+###main###
+print "--------------vector one and two--------------------------"
+ts1=np.array(df1['value'])
+ts2=np.array(df2['value'])
+
+print "--------------original index for vector one and two--------------------------------------"
+ts1index= df1.index
+print "-----------------"
+#print ts1index
+ 
+ts2index=df2.index
+#print ts2index
+####
+print "--------------------distance matrix---------------------"
+dis=Distances(ts1,ts2)
+#print dis
+##
+print "----------------list of diagonals ---------------------------------------------"
+arr,index=DiagonalList(dis)
+startindex=index
+zeroarr=arr
+#print startindex
+#print zeroarr
+      
+print "---------------diagonal index with zero---------------------------------------"
+diaindex,diaval= DiagonalZero(dis,ts1,ts2,startindex)
+diavals=diaval
+diaindexes=diaindex
+#print diavals
+#print diaindexes
+
+runs=zero_runs(diavals)
+
+print "---list of indexes---"
+#print runs
+
+print "----maximum count of zero-------------------------------------"
+sumzero,i=CountZero(runs)
+sumindex=i
+#print sumindex
+maxruns=runs[sumindex]
+#print maxruns
+zeroruns=sumzero
+#print zeroruns
+print "*******************"
+######
+diazero=ZeroIndex(maxruns,diaindexes,sumindex)
+####
+#print diazero
+xindex2,yindex2,ts1value,ts2value=DupIndex(ts1,ts2,diazero)
+######
+xindex=xindex2
+yindex=yindex2
+duplicatedts1=ts1value
+duplicatedts2=ts2value
+#print xindex
+#print yindex
+print duplicatedts1
+print"-----------------------dup1------------------------------------------------------"
+print duplicatedts2
+
+print "-----------dictionary for original index--------------------------------------------------"  
+originalindexts1=lookupdict(ts1index)
+originalindexts2=lookupdict(ts2index)
+#print originalindexts1
+#print originalindexts2
+
+
+print "--------------match to original index-------------------------"
+print ExtracteIndex(originalindexts1,originalindexts2,xindex,yindex) 
+
+print time.time() - start_time, "seconds"
+
+## Plot the two duplicated value lists in two stacked panels (the first one in red).
+import matplotlib.pyplot as plt
+plt.subplot(2, 1, 1)
+plt.plot(duplicatedts1,'r-')
+plt.ylabel('vector x')
+plt.subplot(2, 1, 2)
+plt.plot(duplicatedts2)
+plt.ylabel('vector y')
+
+plt.show()