diff --git a/dedupe/dtw/DRAFT- Write up_ MP java Code.pdf b/dedupe/dtw/DRAFT- Write up_ MP java Code.pdf new file mode 100644 index 00000000..cb407a3f Binary files /dev/null and b/dedupe/dtw/DRAFT- Write up_ MP java Code.pdf differ diff --git a/dedupe/dtw/DistanceMatrixCutOffAssumption.java b/dedupe/dtw/DistanceMatrixCutOffAssumption.java new file mode 100644 index 00000000..407c3bc8 --- /dev/null +++ b/dedupe/dtw/DistanceMatrixCutOffAssumption.java @@ -0,0 +1,371 @@ +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.util.Scanner; + +public class DistanceMatrixCutOffAssumption +{ + static int[] ZeroCounter; //static members of the class which can be accessed anywhere + static int[][] DistanceMatrix; + static int[] LongestZeroString; + static int[][] LZScoordinates; + static int[] DataSetX; + static int[] DataSetY; + + + + public static void main(String[] args) throws FileNotFoundException + { + +//QUICK TEST CASES + // int[] ArrayY = new int[]{1,1,2,2,1,1};//set of values that should have different LongestZeroString and ZeroCounter + // int[] ArrayX = new int[]{1,2,2,2,0,1,1,1}; + + // int[] ArrayX = new int[]{1,1,2,3,2,1,0,1};//set of values that should have different LongestZeroString and ZeroCounter + // int[] ArrayY = new int[]{0,1,1,2,3,2,1,1,3,1,2}; + + //{0,0,0,1,1};// + //{0,0,1,1,0};// + + + int[] ArrayY = new int[]{1,2,1,1,2,4,4,3}; + int[] ArrayX = new int[]{1,1,1,2,4,4,3,2,2,3,4,5}; + + // int[] ArrayX = new int[]{2,2,3,3,2,2,2,2,2}; //set of values that should have different LongestZeroString and ZeroCounter + // int[] ArrayY = new int[]{2,2,2,3,2,2}; + + // int[] ArrayX = new int[]{3,2,3,1}; //GOOD example of finding starting coordinates of starting positions + // int[] ArrayY = new int[]{2,3,1,3,1,3,4,1}; + // int[] ArrayX = new int[]{80,82,85,85,94,90,90,75,77,80,81}; + // int[] ArrayY = new int[]{75,77,80,82,85,85,85,89,94,90,90}; + + + FileInputStream data = new FileInputStream("/Users/samuelrapp/Desktop/DM/datasetTest.txt"); //X + FileInputStream data2 = new FileInputStream("/Users/samuelrapp/Desktop/DM/dataset2Test.txt"); //Y + Scanner scanner = new Scanner(data); + Scanner scanner2 = new Scanner(data2); + + DataSetX = new int[11044];//I looked at the number of lines in the file + DataSetY = new int[11044]; + DataSetX = ArrayX; + DataSetY = ArrayY; + + // int i = 0; + // while(scanner.hasNextLine()) + // { + + // DataSetX[i] = scanner.nextInt(); + // DataSetY[i] = scanner2.nextInt(); + + // i++; + // } + // scanner.close(); + // scanner2.close(); + + + + int HorizontalTraversals = DataSetX.length+DataSetY.length-1; + ZeroCounter = new int[HorizontalTraversals]; //count total zero/matchs in each diagonal traversal + LongestZeroString = new int[HorizontalTraversals]; //tallys the longest continious string of zeros found in any given traversal + LZScoordinates = new int[HorizontalTraversals][2];//records coordinates(in 2 column matrix) 0 index is X pos, 1 index is Y pos (seems reversed for some reason) + + + + DistanceMatrix = new int[DataSetY.length][DataSetX.length]; //row then columns (incrementing the X values chances the column) + PrintMatrix(DistanceMatrix); + DisMatrixFillingDiagonally(DataSetY,DataSetX); + // PrintMatrix(DistanceMatrix); //just a basic print function + // PrintMatrix(LZScoordinates); + System.out.println("RESULT SECTION:"); + System.out.println("1. Array index of most zeros: " + getIndexOfLargest(ZeroCounter)); //index of most exact overlap(ala most matching values/zeros) + System.out.println("2. Array index of longest string of zeros: " + getIndexOfLargest(LongestZeroString)); + System.out.println("3. Starting coordinates of the traversal(on the grid edge) with the longest continuous string of zeros: (x,y)_" + GridEdgeCoordinates(LongestZeroString, DistanceMatrix)); //index of most exact overlap(ala most matching values/zeros) + System.out.println("4. Starting coordinates of the traversal(on the grid edge) with the most total zeros: (x,y)_" + GridEdgeCoordinates(ZeroCounter, DistanceMatrix));//converts the index of greatest traversal to the starting index(on the edge of the grid) of said traversal + DisMatrixHorizontal(ArrayY,ArrayX);//standard double for loop to create the proper distance matrix- to check agaisnt by horizontal creation + PrintMatrix(DistanceMatrix); + + System.out.println("5. Exact X & Y grid coordinates of longest String of Zeros (x,y)_"+ "(" + LZScoordinates[getIndexOfLargest(LongestZeroString)][0] + "," + LZScoordinates[getIndexOfLargest(LongestZeroString)][1] + ")"); + System.out.println("6. Percentage Match relative to perfect match_(most total zeros)_" + PercentMatch(ZeroCounter,DistanceMatrix) + "%"); + System.out.println("7. Percentage Match relative to perfect match_(longest continous string of zeros)_" + PercentMatch(LongestZeroString,DistanceMatrix) + "%"); + + } + +//public static void ReadGoogleSlides() + + public static void DisMatrixFillingDiagonally(int[]DY, int[]DX) + {//arguemnts are the arrays of data points + //Xpos = columns Ypos= rows + int TotalTraversals = DY.length+DX.length-1; //# of diagonal iterations needed to go through every potential unit of the grid. Its the number of edge points we use + //13 is the answer for a 7 by 7 grid. + //its equal to the number of edge values or length + width - 1 + int TraversalsStartingonYAxis = TotalTraversals-DX.length; + int TraversalsStartingonXAxis = TotalTraversals-DY.length; + + int shortersidelength; //determining the longest possible diagonal in the given matrix, ala the length of the shorter data set + boolean LongerSideY; + int difference; + + if(TraversalsStartingonYAxis>TraversalsStartingonXAxis) //the Y axis is bigger than the X axis + { + shortersidelength = TraversalsStartingonXAxis+1; + LongerSideY = true; + } + else ////the X axis is bigger than the Y axis + { + shortersidelength = TraversalsStartingonYAxis+1; + LongerSideY = false; + } + + if(LongerSideY==true) //to make sure I get a positive value when taking the difference. + { + difference = TraversalsStartingonYAxis - TraversalsStartingonXAxis; + } + else + { + difference = TraversalsStartingonXAxis - TraversalsStartingonYAxis; + } + + //int CutOffPoint = (shortersidelength/2);//setting what value with result in a corner cut off + int CutOffPoint = (shortersidelength/100);//lower cut off, less traversals removed. + + int XstartPos = 0; + int YstartPos = 0; + int traversalsFromMiddle = 0; + int ZeroCounter = 0; //index of ZeroCounter array- it is the same as the traversalFromMiddle untill TFM is reset after half the grid is filled + //with evenly sized arrays the X value will always be Odd so we have to account for java rounding down by adding 1 + + for(int i=0; i<(TraversalsStartingonXAxis+1); i++)//goes through as many times are you need to traverse the middle traversal to the bottom left corner. + { + if(LongerSideY==false) //X is the longer side, so it must be accounted for in the assumption + { + if(CutOffPoint>=(shortersidelength+difference-i))//this code would cut off the Matrix from filling itself once the traversal length was less than half the maximum diagonal. + { + //System.out.println("Skipped"); + ZeroCounter = ZeroCounter +1; + continue; + } + } + else + { + if(CutOffPoint>=(shortersidelength-i))//this code would cut off the Matrix from filling itself once the traversal length was less than half the maximum diagonal. + { + //System.out.println("Skipped:::"); + ZeroCounter = ZeroCounter + 1; + continue; + } + } + + + traversalsFromMiddle = MatrixHelper(DY, DX, XstartPos, YstartPos, traversalsFromMiddle, ZeroCounter)+1; //if the Helper returns the traversal count we can use its value even if the method is creating seperate variables + ZeroCounter++; + XstartPos = traversalsFromMiddle; + } + + traversalsFromMiddle=1; + XstartPos = 0;//give new starting edge index values + YstartPos = 1; + +//System.out.println(LongerSideY); + + for(int p=0; p=(shortersidelength-p-1))//this code would cut off the Matrix from filling itself once the traversal length was less than half the maximum diagonal. + {//+-1 because the values start at (0,1) instead of (0,0) in the for loop above. So there are fewer loops to reach the threshhold + //System.out.println("Skipped->"); + ZeroCounter = ZeroCounter + 1; + continue; + } + } + else //Y is the longer side so we have to give it more buffer space + { + if(CutOffPoint>=(shortersidelength+difference-p-1))//this code would cut off the Matrix from filling itself once the traversal length was less than half the maximum diagonal. + { + //System.out.println("Skipped----"); + ZeroCounter = ZeroCounter + 1; + continue; + } + } + + + traversalsFromMiddle = MatrixHelper(DY, DX, XstartPos, YstartPos, traversalsFromMiddle, ZeroCounter)+1; + ZeroCounter++; + YstartPos = traversalsFromMiddle; + } + } + + + /*to know when to make the recursive call A know how many grid positions must be filled before hand and count or B know the final position index you will land at + + if the length/width of the DisMX grid are equal(THE GRID IS A SQUARE) the length of the longest horizontal(corner to corner) requires sidelength amount of sqaure traversals. + The longest traversal of a grid with sidelength of X is X operations. Starting at position (0,0) + starting position (0,y) or (y,0) (where sidelength>y>0) takes Sidelength-Y operations to reach its final grid position + + ex with #s: a 7 by 7 grid's longest traversal is 7 operations long(Starting at position 0,0) + (0,1)=6op (0,2) = 5...(0,6) = 1 same with (1,0)=6op, (2,0)=5op....(6,0)=1op + longest traversal(starting at 0,0) is maximum the length of the shorter side + + + + for a sqaure the longest traversal ends at (sidelength, sidelength) (the perfect diagonal) + starting position (y,0) ends at position (sidelength-1, sidelength-y-1) //moving towards bottom right corner of grid + starting position (0,x) ends at position (sidelength-x-1, sidelength-1) //moving towards top right corner of grid + //we do -1 because the side length starts at 1 but the indexing of the grid starts at 0. so we have to adjust + */ + + + + public static int MatrixHelper(int[]DY, int[]DX, int Xpos, int Ypos, int traversals, int ZerocIndex) + {//the matrix helper function does one complete diagonal traversal of the grid + //It compares values from DY/DX arrays + //int XstartPos, int YstartPos are starting positions for the traversal + //traversals is a arguement that keeps the matrix aware of how far it is from the center diagonal + //ZeroxIndex is the index of the zero counter static array(for data collection) It matches the traversals counter until traversals is reset before the 2nd for loop in DisMatrixFillingDiagonally + + int SidelengthX = DX.length; //.length is the actually amount of positions allocated in memory so index 0-9 is length 10 + int SidelengthY = DY.length; + boolean test = true; + int ActiveZeroCounter = 0; + + while(test==true) + { + int DistanceValue = (DX[Xpos]-DY[Ypos])*(DX[Xpos]-DY[Ypos]); //the difference between the values squared + DistanceMatrix[Ypos][Xpos] = DistanceValue;//put value into Matrix + + if(DistanceValue == 0)//found match (maybe pick a larger range ala 2-6 to allow for puedomatch) + { + ZeroCounter[ZerocIndex] = ZeroCounter[ZerocIndex]+1;//increment index of total zeros in diagonal traversal + ActiveZeroCounter = ActiveZeroCounter+1; + } + else //found non zero value + { + if(ActiveZeroCounter > LongestZeroString[ZerocIndex])//if the String of zeros currently found is greater than what we have previously found for this traversal + { + LongestZeroString[ZerocIndex] = ActiveZeroCounter;//change the value of longest Active Zero counter to the active zero counter + LZScoordinates[ZerocIndex][0] = (Xpos - ActiveZeroCounter); + LZScoordinates[ZerocIndex][1] = (Ypos - ActiveZeroCounter); + + } + ActiveZeroCounter = 0; //reset the active counter to zero + } + + //leave loop if the array indexs being called match the final position the call reaches + if(Xpos >= (SidelengthX-1) || Ypos >= (SidelengthY-1))//-1 is because array index starts at zero but side length doesn't. + { + if(ActiveZeroCounter > LongestZeroString[ZerocIndex])//Need to check in senario when the traversal ends in a match/zero + { + LongestZeroString[ZerocIndex] = ActiveZeroCounter; + System.out.println(ActiveZeroCounter); + if(ActiveZeroCounter>0) + LZScoordinates[ZerocIndex][0] = ((Xpos+1) - ActiveZeroCounter); + LZScoordinates[ZerocIndex][1] = ((Ypos+1) - ActiveZeroCounter); + //the +1 is needed because the ActiveZeroCounter could potentially give you a negative 'coordinate', + //If the zero string makes it all the way to the bottom, the (X/Ypos-activeZeroCounter) < 0. Not possible + //It isn't needed for the strings that don't end on the final position on the traversal because the +1s are happening within the while loop. + //IE. These coordinates are being calculated during the same loop as the final zero is found, but in the case above the coordinates are being calculated in the loop after the final zero is found. + } + + test = false; + } + + Xpos = Xpos+1; + Ypos = Ypos+1; + } + + return traversals; //so we can keep track of the position of the traversal + //we need to know which travesal we have just done. To know where to start the next one + } + + + public static String GridEdgeCoordinates(int[] array, int[][]Matrix) //Gets starting coordinates of traversal based on largest value in array + { + int LargestValueIndex = getIndexOfLargest(array); + String coordinates; + + + if(LargestValueIndex>=(Matrix[1].length)) + { + coordinates = new String("(" + 0 + "," + ((LargestValueIndex-(Matrix[1].length)+1) +") "+ "value: " + array[LargestValueIndex])); + //if the starting value was in the top half of the grid + //this is because the grid is made middle to bottom left corner then back to the middle line to top right corner + } + else + { + coordinates = new String("(" + LargestValueIndex + "," + 0 + ") " + "value: " + array[LargestValueIndex]); + } + + return coordinates; + } + + public static int getIndexOfLargest(int[] array) //returns index of largest value in an array + { + if(array == null || array.length == 0) + { + return -1; // null or empty + } + + int largest = 0; + + for(int i = 1; iarray[largest]) + { + largest = i; + } + } + + return largest; + } + + public static double PercentMatch(int[]Array, int[][]Matrix) + { //takes in one array, and a matrix. Finds the percent match between largest value in array and shorter sidelength of the matrix. + //We are determining how many matchs occured(or consequetively or not), in the best case, vs how many could have + int IndexOfLargest = getIndexOfLargest(Array); + double ValueTotal; + + //no given traversal is longer than the shorter side of the Matrix-thus the longest traversal is = the shorter sidelength + if(Matrix.length > Matrix[1].length) + { + ValueTotal = Matrix[1].length; + } + else + { + ValueTotal = Matrix.length; + } + + double ValueMatchs = (double) Array[IndexOfLargest]; + double PercentMatch = (ValueMatchs/ValueTotal)*100; + return PercentMatch; + } + + public static void PrintMatrix(int[][]Matrix) //prints the Distance Matrix given a Matrix + {//Print out the matrix in nice + for (int i = 0; i < Matrix.length; i++) + { + for (int j = 0; j < Matrix[i].length; j++) + { + System.out.print(Matrix[i][j] + " | "); + } + System.out.println(); + } + System.out.print("_____________________________________"); + System.out.println(); + } + + public static void DisMatrixHorizontal(int[]X,int[]Y) //standard approach to creating the distance matrix via 2 for loop. horizontal traversal not diagonal + { + int[][] DistanceMX = new int[X.length][Y.length]; + for(int i=0; i<(X.length); i++) + { + for(int t=0; t=(shortersidelength-i))//this code would cut off the Matrix from filling itself once the traversal length was less than half the maximum diagonal. + { + //System.out.println("Skipped:::"); + ZeroCounter = ZeroCounter + 1; + continue; + } + } + + if(JumpingPreChecker(DY, DX, XstartPos, YstartPos, CutOffPoint)==true) + { + traversalsFromMiddle = MatrixHelper(DY, DX, XstartPos, YstartPos, traversalsFromMiddle, ZeroCounter)+1; //if the Helper returns the traversal count we can use its value even if the method is creating seperate variables + ZeroCounter++; + XstartPos = traversalsFromMiddle; + } + } + + traversalsFromMiddle=1; + XstartPos = 0;//give new starting edge index values + YstartPos = 1; + +//System.out.println(LongerSideY); + + for(int p=0; p=(shortersidelength-p-1))//this code would cut off the Matrix from filling itself once the traversal length was less than half the maximum diagonal. + {//+-1 because the values start at (0,1) instead of (0,0) in the for loop above. So there are fewer loops to reach the threshhold + //System.out.println("Skipped->"); + ZeroCounter = ZeroCounter + 1; + continue; + } + } + else //Y is the longer side so we have to give it more buffer space + { + if(CutOffPoint>=(shortersidelength+difference-p-1))//this code would cut off the Matrix from filling itself once the traversal length was less than half the maximum diagonal. + { + //System.out.println("Skipped----"); + ZeroCounter = ZeroCounter + 1; + continue; + } + } + + if(JumpingPreChecker(DY, DX, XstartPos, YstartPos, CutOffPoint)==true) //this method checks the traversal in increments of the CutOffPoint Length, which is to say in quantities that we truely care about. + {//this jumping prechecker- assumes that we only care about continious strings of zeros rather than total zeros and finding every zero + traversalsFromMiddle = MatrixHelper(DY, DX, XstartPos, YstartPos, traversalsFromMiddle, ZeroCounter)+1; + ZeroCounter++; + YstartPos = traversalsFromMiddle; + } + } + } + + + /*to know when to make the recursive call A know how many grid positions must be filled before hand and count or B know the final position index you will land at + + if the length/width of the DisMX grid are equal(THE GRID IS A SQUARE) the length of the longest horizontal(corner to corner) requires sidelength amount of sqaure traversals. + The longest traversal of a grid with sidelength of X is X operations. Starting at position (0,0) + starting position (0,y) or (y,0) (where sidelength>y>0) takes Sidelength-Y operations to reach its final grid position + + ex with #s: a 7 by 7 grid's longest traversal is 7 operations long(Starting at position 0,0) + (0,1)=6op (0,2) = 5...(0,6) = 1 same with (1,0)=6op, (2,0)=5op....(6,0)=1op + longest traversal(starting at 0,0) is maximum the length of the shorter side + + + + for a sqaure the longest traversal ends at (sidelength, sidelength) (the perfect diagonal) + starting position (y,0) ends at position (sidelength-1, sidelength-y-1) //moving towards bottom right corner of grid + starting position (0,x) ends at position (sidelength-x-1, sidelength-1) //moving towards top right corner of grid + //we do -1 because the side length starts at 1 but the indexing of the grid starts at 0. so we have to adjust + */ + + public static boolean JumpingPreChecker(int[]DY, int[]DX, int Xpos, int Ypos, int CutOffPoint) + {//Cut off point = minimum length of continuous matching values care about. + + + Boolean EnterMatrixHelper = false; + int shortersidelength; + + if(DX.length>=DY.length) + shortersidelength = DY.length; //finding max traversal length to set up interval length, aka minimum length of continious zeros to be relavent. + else + shortersidelength = DX.length; + + for(int i =0; i<(shortersidelength/CutOffPoint); i++) + { + int DistanceValue = (DX[Xpos]-DY[Ypos])*(DX[Xpos]-DY[Ypos]); + if(DistanceValue == 0) //if a match is found at any of these intervals + { + EnterMatrixHelper = true; + break; + } + Xpos = Xpos + (shortersidelength/CutOffPoint); //increment X/Y positions accordingly + Ypos = Ypos + (shortersidelength/CutOffPoint); + + if(Xpos >= (DX.length-1) || Ypos >= (DY.length-1)) //prevent exiting the indexing values + break; + } + + return EnterMatrixHelper; + + } + + public static int MatrixHelper(int[]DY, int[]DX, int Xpos, int Ypos, int traversals, int ZerocIndex) + {//the matrix helper function does one complete diagonal traversal of the grid + //It compares values from DY/DX arrays + //int XstartPos, int YstartPos are starting positions for the traversal + //traversals is a arguement that keeps the matrix aware of how far it is from the center diagonal + //ZeroxIndex is the index of the zero counter static array(for data collection) It matches the traversals counter until traversals is reset before the 2nd for loop in DisMatrixFillingDiagonally + + int SidelengthX = DX.length; //.length is the actually amount of positions allocated in memory so index 0-9 is length 10 + int SidelengthY = DY.length; + boolean test = true; + int ActiveZeroCounter = 0; + + while(test==true) + { + int DistanceValue = (DX[Xpos]-DY[Ypos])*(DX[Xpos]-DY[Ypos]); //the difference between the values squared + DistanceMatrix[Ypos][Xpos] = DistanceValue;//put value into Matrix + + if(DistanceValue == 0)//found match (maybe pick a larger range ala 2-6 to allow for puedomatch) + { + ZeroCounter[ZerocIndex] = ZeroCounter[ZerocIndex]+1;//increment index of total zeros in diagonal traversal + ActiveZeroCounter = ActiveZeroCounter+1; + } + else //found non zero value + { + if(ActiveZeroCounter > LongestZeroString[ZerocIndex])//if the String of zeros currently found is greater than what we have previously found for this traversal + { + LongestZeroString[ZerocIndex] = ActiveZeroCounter;//change the value of longest Active Zero counter to the active zero counter + LZScoordinates[ZerocIndex][0] = (Xpos - ActiveZeroCounter); + LZScoordinates[ZerocIndex][1] = (Ypos - ActiveZeroCounter); + + } + ActiveZeroCounter = 0; //reset the active counter to zero + } + + //leave loop if the array indexs being called match the final position the call reaches + if(Xpos >= (SidelengthX-1) || Ypos >= (SidelengthY-1))//-1 is because array index starts at zero but side length doesn't. + { + if(ActiveZeroCounter > LongestZeroString[ZerocIndex])//Need to check in senario when the traversal ends in a match/zero + { + LongestZeroString[ZerocIndex] = ActiveZeroCounter; + System.out.println(ActiveZeroCounter); + if(ActiveZeroCounter>0) + LZScoordinates[ZerocIndex][0] = ((Xpos+1) - ActiveZeroCounter); + LZScoordinates[ZerocIndex][1] = ((Ypos+1) - ActiveZeroCounter); + //the +1 is needed because the ActiveZeroCounter could potentially give you a negative 'coordinate', + //If the zero string makes it all the way to the bottom, the (X/Ypos-activeZeroCounter) < 0. Not possible + //It isn't needed for the strings that don't end on the final position on the traversal because the +1s are happening within the while loop. + //IE. These coordinates are being calculated during the same loop as the final zero is found, but in the case above the coordinates are being calculated in the loop after the final zero is found. + } + + test = false; + } + + Xpos = Xpos+1; + Ypos = Ypos+1; + } + + return traversals; //so we can keep track of the position of the traversal + //we need to know which travesal we have just done. To know where to start the next one + } + + + public static String GridEdgeCoordinates(int[] array, int[][]Matrix) //Gets starting coordinates of traversal based on largest value in array + { + int LargestValueIndex = getIndexOfLargest(array); + String coordinates; + + + if(LargestValueIndex>=(Matrix[1].length)) + { + coordinates = new String("(" + 0 + "," + ((LargestValueIndex-(Matrix[1].length)+1) +") "+ "value: " + array[LargestValueIndex])); + //if the starting value was in the top half of the grid + //this is because the grid is made middle to bottom left corner then back to the middle line to top right corner + } + else + { + coordinates = new String("(" + LargestValueIndex + "," + 0 + ") " + "value: " + array[LargestValueIndex]); + } + + return coordinates; + } + + public static int getIndexOfLargest(int[] array) //returns index of largest value in an array + { + if(array == null || array.length == 0) + { + return -1; // null or empty + } + + int largest = 0; + + for(int i = 1; iarray[largest]) + { + largest = i; + } + } + + return largest; + } + + public static double PercentMatch(int[]Array, int[][]Matrix) + { //takes in one array, and a matrix. Finds the percent match between largest value in array and shorter sidelength of the matrix. + //We are determining how many matchs occured(or consequetively or not), in the best case, vs how many could have + int IndexOfLargest = getIndexOfLargest(Array); + double ValueTotal; + + //no given traversal is longer than the shorter side of the Matrix-thus the longest traversal is = the shorter sidelength + if(Matrix.length > Matrix[1].length) + { + ValueTotal = Matrix[1].length; + } + else + { + ValueTotal = Matrix.length; + } + + double ValueMatchs = (double) Array[IndexOfLargest]; + double PercentMatch = (ValueMatchs/ValueTotal)*100; + return PercentMatch; + } + + public static void PrintMatrix(int[][]Matrix) //prints the Distance Matrix given a Matrix + {//Print out the matrix in nice + for (int i = 0; i < Matrix.length; i++) + { + for (int j = 0; j < Matrix[i].length; j++) + { + System.out.print(Matrix[i][j] + " | "); + } + System.out.println(); + } + System.out.print("_____________________________________"); + System.out.println(); + } + + public static void DisMatrixHorizontal(int[]X,int[]Y) //standard approach to creating the distance matrix via 2 for loop. horizontal traversal not diagonal + { + int[][] DistanceMX = new int[X.length][Y.length]; + for(int i=0; i<(X.length); i++) + { + for(int t=0; tArrayY.length) +// { +// //int[] Temp = new int[ArrayX.length]; +// for(int i = 0; iArrayY.length) + // { + // int[] Temp = new int[ArrayX.length]; + // for(int i = 0; iy>0) takes Sidelength-Y operations to reach its final grid position + + ex with #s: a 7 by 7 grid's longest traversal is 7 operations long(Starting at position 0,0) + (0,1)=6op (0,2) = 5...(0,6) = 1 same with (1,0)=6op, (2,0)=5op....(6,0)=1op + longest traversal(starting at 0,0) is maximum the length of the shorter side + + + + for a sqaure the longest traversal ends at (sidelength, sidelength) (the perfect diagonal) + starting position (y,0) ends at position (sidelength-1, sidelength-y-1) //moving towards bottom right corner of grid + starting position (0,x) ends at position (sidelength-x-1, sidelength-1) //moving towards top right corner of grid + //we do -1 because the side length starts at 1 but the indexing of the grid starts at 0. so we have to adjust + */ + + + + public static int MatrixHelper(int[]D1, int[]D2, int Xpos, int Ypos, int traversals, int ZerocIndex) + {//the matrix helper function does one complete horizontal traversal of the grid + + int SidelengthX = D1.length; //.length is the actually amount of positions allocated in memory so index 0-9 is length 10 + boolean test = true; + int ActiveZeroCounter = 0; + + while(test==true) + { + //System.out.println("(Y,X)" + Ypos + "," + Xpos); + //System.out.println("traversals = " + traversals); //doesn't work in secound half of the listing unless the traversal values are reset to 1 + + int DistanceValue = (D2[Ypos]-D1[Xpos])*(D2[Ypos]-D1[Xpos]); //the difference between the values squared + DistanceMatrix[Xpos][Ypos] = DistanceValue;//put value into Matrix + + if(DistanceValue == 0)//found match (maybe pick a larger range ala 2-6 to allow for puedomatch) + { + ZeroCounter[ZerocIndex] = ZeroCounter[ZerocIndex]+1;//increment index of total zeros in diagonal traversal + + ActiveZeroCounter = ActiveZeroCounter+1; + System.out.println("Active Zero Counter: " + ActiveZeroCounter); + } + else //found non zero value + { + if(ActiveZeroCounter > LongestZeroString[ZerocIndex])//if the String of zeros currently found is greater than what we have previously found for this traversal + { + LongestZeroString[ZerocIndex] = ActiveZeroCounter;//change the value of longest Active Zero counter to the active zero counter + } + //System.out.println(ActiveZeroCounter); + ActiveZeroCounter = 0; //reset the active counter to zero + } + + //leave loop if the array indexs being called match the final position the call reaches + if(Xpos == SidelengthX-1-traversals && Ypos == SidelengthX-1)//-1 is because array index starts at zero side length doesn't. + { + //System.out.println("time to break-TR"); + //System.out.println("number of zeros:" + ZeroCounter[ZerocIndex]); + if(ActiveZeroCounter > LongestZeroString[ZerocIndex])//Need to check in senario when the traversal ends in a match/zero + { + LongestZeroString[ZerocIndex] = ActiveZeroCounter; + } + + test = false; + break; + } + + if(Xpos== SidelengthX-1 && Ypos== SidelengthX-1-traversals)//the ending coordinates for values below the middle traversal towards to bottom left + { + //System.out.println("time to break2-BL"); + //System.out.println("number of zeros:" + ZeroCounter[ZerocIndex]); + if(ActiveZeroCounter > LongestZeroString[ZerocIndex])//Need to check in senario when the traversal ends in a match/zero + { + LongestZeroString[ZerocIndex] = ActiveZeroCounter;//change the value of longest Active Zero counter to the active zero counter + } + + test = false; + break; + } + + Xpos = Xpos+1; + Ypos = Ypos+1; + } + + return traversals; //so we can keep track of the position of the traversal + //we need to know which travesal we have just done. To know where to start the next one + } + + + public static String GridStartingCoordinates(int[] array) //Gets starting coordinates of traversal based on largest value in array + { + int LargestValueIndex = getIndexOfLargest(array); + String coordinates; + + if(LargestValueIndex>((array.length/2))) + { + coordinates = new String(("(" + 0 + "," + (LargestValueIndex-((array.length/2))) +") "+ "value: " + array[LargestValueIndex])); + //if the starting value was in the top half of the grid + //this is because the grid is made middle to bottom left corner then back to the middle line to top right corner + } + else + { + coordinates = new String("(" + LargestValueIndex + "," + 0 + ") " + "value: " + array[LargestValueIndex]); + } + return coordinates; + } + +} + + diff --git a/dedupe/dtw/Old versions MP/DistanceMatrixUpdate1.java b/dedupe/dtw/Old versions MP/DistanceMatrixUpdate1.java new file mode 100644 index 00000000..3e21a193 --- /dev/null +++ b/dedupe/dtw/Old versions MP/DistanceMatrixUpdate1.java @@ -0,0 +1,381 @@ +public class DistanceMatrixUpdate1 extends SqaureGridUpdate1 +{ + static int[] ZeroCounter; //static members of the class which can be accessed anywhere + static int[][] DistanceMatrix; + static int[] LongestZeroString; + static int[][] LZScoordinates; + + public static void main(String[] args) + { + +//QUICK TEST CASES + // {1,1,2,2,1,1};//set of values that should have different LongestZeroString and ZeroCounter + // {1,2,2,2,0,1}; + + // int[] ArrayX = new int[]{1,1,2,3,2,1,0};//set of values that should have different LongestZeroString and ZeroCounter + // int[] ArrayY = new int[]{0,1,1,2,3,2,1}; + + //{0,0,0,1,1};// + //{0,0,1,1,0};// + + + //int[] ArrayX = new int[]{1,1,1,1,2,4,3,2}; //set of values that should have different LongestZeroString and ZeroCounter + //int[] ArrayY = new int[]{1,1,1,2,4,4,3,2}; + + //{2,2,2,3,3,2,2}; //set of values that should have different LongestZeroString and ZeroCounter + //{2,2,2,2,3,2,2}; + + int[] ArrayX = new int[]{3,2,3,1,3,1,2,1}; //GOOD example of finding starting coordinates of starting positions + int[] ArrayY = new int[]{2,3,1,2,1,2,1,1}; + + // int[] ArrayX = new int[]{80,82,85,85,94,90,90,75,77,80,81}; + // int[] ArrayY = new int[]{75,77,80,82,85,85,85,89,94,90,90}; + +// int[] ArrayX = new int[]{137566,137566 +// ,137566 +// ,137566 +// ,137566 +// ,137566 +// ,137566 +// ,137566 +// ,137566 +// ,137566 +// ,137566 +// ,137566 +// ,137566 +// ,137566 +// ,137566 +// ,137566 +// ,137566 +// ,137566 +// ,191 +// ,1000 +// ,191 +// ,992 +// ,191 +// ,191 +// ,191 +// ,191 +// ,1000 +// ,191 +// ,1000 +// ,191 +// ,1000 +// ,1000 +// ,1000 +// ,191 +// ,191 +// ,1000 +// ,127 +// ,992 +// ,127 +// ,127 +// ,127 +// ,127 +// ,1000 +// ,127 +// ,1000 +// ,127 +// ,1000 +// ,1000 +// ,1000 +// ,127 +// ,127 +// ,1000 +// ,992 +// ,1000 +// ,1000 +// ,1000 +// ,1000 +// ,1000 +// ,1000 +// ,1000 +// ,1000 +// ,1000 +// ,1000 +// ,1000}; +// int[] ArrayY = new int[]{191, +// 127, +// 1000, +// 137, +// 992, +// 100, +// 98, +// 101, +// 97, +// 1000, +// 114, +// 1000, +// 121, +// 1000, +// 1000, +// 1000, +// 99, +// 115, +// 127, +// 191, +// 137, +// 191, +// 100, +// 98, +// 101, +// 97, +// 191, +// 114, +// 191, +// 121, +// 191, +// 191, +// 191, +// 99, +// 115, +// 127, +// 137, +// 127, +// 100, +// 98, +// 101, +// 97, +// 127, +// 114, +// 127, +// 121, +// 127, +// 127, +// 127, +// 99, +// 115, +// 137, +// 1000, +// 100, +// 98, +// 101, +// 97, +// 1000, +// 114, +// 1000, +// 121, +// 1000, +// 1000, +// 1000}; + + + int HorizontalTraversals = ArrayX.length+ArrayY.length-1; + ZeroCounter = new int[HorizontalTraversals]; //count zeros in each horizontal traversal each index is a travesal + LongestZeroString = new int[HorizontalTraversals]; //tallys the longest continious string of zeros found in any given traversal + LZScoordinates = new int[HorizontalTraversals][2];//records coordinates(in 2 column matrix) 0 index is X pos, 1 index is Y pos (seems reversed for some reason) + + + if(ArrayX.length==ArrayY.length) + { + DistanceMatrix = new int[ArrayY.length][ArrayX.length]; //row then columns (incrementing the X values chances the column) + //Columns of DisMatrixFillingHorizontally = DisMatrixCreater + + DisMatrixFillingHorizontally(ArrayY,ArrayX);//horizontally filling the Matrix, while counting zeros per traversal + PrintMatrix(DistanceMatrix); //just a basic print function + //PrintMatrix(LZScoordinates); + System.out.println("1. Array index of most zeros: " + getIndexOfLargest(ZeroCounter)); //index of most exact overlap(ala most matching values/zeros) + System.out.println("2. Array index of longest string of zeros: " + getIndexOfLargest(LongestZeroString)); + System.out.println("3. Starting coordinates of the traversal(on the grid edge) with the longest continuous string of zeros: (x,y)_" + GridStartingCoordinates(LongestZeroString)); //index of most exact overlap(ala most matching values/zeros) + System.out.println("4. Starting coordinates of the traversal(on the grid edge) with the most total zeros: (x,y)_" + GridStartingCoordinates(ZeroCounter));//converts the index of greatest traversal to the starting index(on the edge of the grid) of said traversal + //DisMatrixCreater(ArrayY,ArrayX);//standard double for loop to create the proper distance matrix- to check agaisnt by horizontal creation + + System.out.println("5. Exact X & Y grid coordinates of longest String of Zeros (x,y)_"+ "(" + LZScoordinates[getIndexOfLargest(LongestZeroString)][1] + "," + LZScoordinates[getIndexOfLargest(LongestZeroString)][0] + ")"); + System.out.println("6. Percentage Match relative to perfect match_(most total zeros)_" + PercentMatch(ZeroCounter,ArrayX) + "%"); + System.out.println("7. Percentage Match relative to perfect match_(longest continous string of zeros)_" + PercentMatch(LongestZeroString,ArrayX) + "%"); + + PrintDataStructures(); + + } + else + { + System.out.println("The arrays being compared aren't the same length-"); + } + + } + + public static void PrintDataStructures() + {//maybe make them visualize, next to each other. (maybe not practical) + // System.out.print("Index: DataStrucktures"); + // for(int x=0; xy>0) takes Sidelength-Y operations to reach its final grid position + + ex with #s: a 7 by 7 grid's longest traversal is 7 operations long(Starting at position 0,0) + (0,1)=6op (0,2) = 5...(0,6) = 1 same with (1,0)=6op, (2,0)=5op....(6,0)=1op + longest traversal(starting at 0,0) is maximum the length of the shorter side + + + + for a sqaure the longest traversal ends at (sidelength, sidelength) (the perfect diagonal) + starting position (y,0) ends at position (sidelength-1, sidelength-y-1) //moving towards bottom right corner of grid + starting position (0,x) ends at position (sidelength-x-1, sidelength-1) //moving towards top right corner of grid + //we do -1 because the side length starts at 1 but the indexing of the grid starts at 0. so we have to adjust + */ + + + + public static int MatrixHelper(int[]D1, int[]D2, int Xpos, int Ypos, int traversals, int ZerocIndex) + {//the matrix helper function does one complete horizontal traversal of the grid + + int SidelengthX = D1.length; //.length is the actually amount of positions allocated in memory so index 0-9 is length 10 + boolean test = true; + int ActiveZeroCounter = 0; + + while(test==true) + { + //System.out.println("(Y,X)" + Ypos + "," + Xpos); + //System.out.println("traversals = " + traversals); //doesn't work in secound half of the listing unless the traversal values are reset to 1 + + int DistanceValue = (D2[Ypos]-D1[Xpos])*(D2[Ypos]-D1[Xpos]); //the difference between the values squared + DistanceMatrix[Xpos][Ypos] = DistanceValue;//put value into Matrix + + if(DistanceValue == 0)//found match (maybe pick a larger range ala 2-6 to allow for puedomatch) + { + ZeroCounter[ZerocIndex] = ZeroCounter[ZerocIndex]+1;//increment index of total zeros in diagonal traversal + ActiveZeroCounter = ActiveZeroCounter+1; + // System.out.println("Active Zero Counter: " + ActiveZeroCounter); + } + else //found non zero value + { + if(ActiveZeroCounter > LongestZeroString[ZerocIndex])//if the String of zeros currently found is greater than what we have previously found for this traversal + { + LongestZeroString[ZerocIndex] = ActiveZeroCounter;//change the value of longest Active Zero counter to the active zero counter + //System.out.println("#: " + (Xpos - ActiveZeroCounter) + "," + (Ypos - ActiveZeroCounter)); + LZScoordinates[ZerocIndex][0] = (Xpos - ActiveZeroCounter); + LZScoordinates[ZerocIndex][1] = (Ypos - ActiveZeroCounter); + + } + System.out.println(ActiveZeroCounter); + ActiveZeroCounter = 0; //reset the active counter to zero + } + + //leave loop if the array indexs being called match the final position the call reaches + if(Xpos == SidelengthX-1-traversals && Ypos == SidelengthX-1)//-1 is because array index starts at zero side length doesn't. + { + //System.out.println("time to break-TR"); + //System.out.println("number of zeros:" + ZeroCounter[ZerocIndex]); + if(ActiveZeroCounter > LongestZeroString[ZerocIndex])//Need to check in senario when the traversal ends in a match/zero + { + LongestZeroString[ZerocIndex] = ActiveZeroCounter; + System.out.println("#: " + (Xpos - ActiveZeroCounter + 1) + "," + (Ypos - ActiveZeroCounter + 1)); + LZScoordinates[ZerocIndex][0] = (Xpos - ActiveZeroCounter + 1); + LZScoordinates[ZerocIndex][1] = (Ypos - ActiveZeroCounter + 1); + + } + + test = false; + break; + } + + if(Xpos== SidelengthX-1 && Ypos== SidelengthX-1-traversals)//the ending coordinates for values below the middle traversal towards to bottom left + { + //System.out.println("time to break2-BL"); + //System.out.println("number of zeros:" + ZeroCounter[ZerocIndex]); + if(ActiveZeroCounter > LongestZeroString[ZerocIndex])//Need to check in senario when the traversal ends in a match/zero + { + LongestZeroString[ZerocIndex] = ActiveZeroCounter;//change the value of longest Active Zero counter to the active zero counter + System.out.println("#: " + (Xpos - ActiveZeroCounter + 1) + "," + (Ypos - ActiveZeroCounter + 1)); + LZScoordinates[ZerocIndex][0] = (Xpos - ActiveZeroCounter + 1); + LZScoordinates[ZerocIndex][1] = (Ypos - ActiveZeroCounter + 1); + } + + test = false; + break; + } + + Xpos = Xpos+1; + Ypos = Ypos+1; + } + + return traversals; //so we can keep track of the position of the traversal + //we need to know which travesal we have just done. To know where to start the next one + } + + + public static String GridStartingCoordinates(int[] array) //Gets starting coordinates of traversal based on largest value in array + { + int LargestValueIndex = getIndexOfLargest(array); + String coordinates; + + if(LargestValueIndex>((array.length/2))) + { + coordinates = new String(("(" + (LargestValueIndex-((array.length/2))) + "," + 0 +") "+ "value: " + array[LargestValueIndex])); + //if the starting value was in the top half of the grid + //this is because the grid is made middle to bottom left corner then back to the middle line to top right corner + } + else + { + coordinates = new String("(" + 0 + "," + LargestValueIndex + ") " + "value: " + array[LargestValueIndex]); + } + + return coordinates; + } + + public static double PercentMatch(int[]Array, int[]GridSideLength) + { //takes in two arrays. Finds the percent match between largest value in first array and max length in second array + int IndexOfLargest = getIndexOfLargest(Array); + double ValueTotal = GridSideLength.length; + double ValueMatchs = (double) Array[IndexOfLargest]; + double PercentMatch = (ValueMatchs/ValueTotal)*100; + return PercentMatch; + } + +} + + diff --git a/dedupe/dtw/Old versions MP/DistanceMatrixUpdate2.java b/dedupe/dtw/Old versions MP/DistanceMatrixUpdate2.java new file mode 100644 index 00000000..696bf2ef --- /dev/null +++ b/dedupe/dtw/Old versions MP/DistanceMatrixUpdate2.java @@ -0,0 +1,262 @@ +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.util.Scanner; + +public class DistanceMatrixUpdate2 extends SqaureGridUpdate2 +{ + static int[] ZeroCounter; //static members of the class which can be accessed anywhere + static int[][] DistanceMatrix; + static int[] LongestZeroString; + static int[][] LZScoordinates; + static int[] DataSetX; + static int[] DataSetY; + + + + public static void main(String[] args) throws FileNotFoundException + { + +//QUICK TEST CASES + // {1,1,2,2,1,1};//set of values that should have different LongestZeroString and ZeroCounter + // {1,2,2,2,0,1}; + + // int[] ArrayX = new int[]{1,1,2,3,2,1,0};//set of values that should have different LongestZeroString and ZeroCounter + // int[] ArrayY = new int[]{0,1,1,2,3,2,1}; + + //{0,0,0,1,1};// + //{0,0,1,1,0};// + + + // int[] ArrayX = new int[]{1,1,1,1,2,4,3,2}; //set of values that should have different LongestZeroString and ZeroCounter + // int[] ArrayY = new int[]{1,1,1,2,4,4,3,2}; + + //{2,2,2,3,3,2,2}; //set of values that should have different LongestZeroString and ZeroCounter + //{2,2,2,2,3,2,2}; + + // int[] ArrayX = new int[]{3,2,3,1,3,1,2,1}; //GOOD example of finding starting coordinates of starting positions + // int[] ArrayY = new int[]{2,3,1,2,1,2,1,1}; + + // int[] ArrayX = new int[]{80,82,85,85,94,90,90,75,77,80,81}; + // int[] ArrayY = new int[]{75,77,80,82,85,85,85,89,94,90,90}; + + + FileInputStream data = new FileInputStream("/Users/srapp/Desktop/data-analytics/dedupe/dtw/dataset.txt"); //X + FileInputStream data2 = new FileInputStream("/Users/srapp/Desktop/data-analytics/dedupe/dtw/dataset2.txt"); //Y + Scanner scanner = new Scanner(data); + Scanner scanner2 = new Scanner(data2); + + DataSetX = new int[11044];//I looked at the number of lines in the file + DataSetY = new int[11044]; + + int i = 0; + while(scanner.hasNextLine()) + { + + DataSetX[i] = scanner.nextInt(); + DataSetY[i] = scanner2.nextInt(); + + i++; + } + scanner.close(); + scanner2.close(); + + + + int HorizontalTraversals = DataSetX.length+DataSetY.length-1; + ZeroCounter = new int[HorizontalTraversals]; //count zeros in each horizontal traversal each index is a travesal + LongestZeroString = new int[HorizontalTraversals]; //tallys the longest continious string of zeros found in any given traversal + LZScoordinates = new int[HorizontalTraversals][2];//records coordinates(in 2 column matrix) 0 index is X pos, 1 index is Y pos (seems reversed for some reason) + + + if(DataSetX.length==DataSetY.length) + { + DistanceMatrix = new int[DataSetY.length][DataSetX.length]; //row then columns (incrementing the X values chances the column) + //Columns of DisMatrixFillingHorizontally = DisMatrixCreater + + DisMatrixFillingHorizontally(DataSetY,DataSetX); + //DisMatrixFillingHorizontally(ArrayY,ArrayX);//horizontally filling the Matrix, while counting zeros per traversal + // PrintMatrix(DistanceMatrix); //just a basic print function + // PrintMatrix(LZScoordinates); + System.out.println("RESULT SECTION:"); + System.out.println("1. Array index of most zeros: " + getIndexOfLargest(ZeroCounter)); //index of most exact overlap(ala most matching values/zeros) + System.out.println("2. Array index of longest string of zeros: " + getIndexOfLargest(LongestZeroString)); + System.out.println("3. Starting coordinates of the traversal(on the grid edge) with the longest continuous string of zeros: (x,y)_" + GridStartingCoordinates(LongestZeroString)); //index of most exact overlap(ala most matching values/zeros) + System.out.println("4. Starting coordinates of the traversal(on the grid edge) with the most total zeros: (x,y)_" + GridStartingCoordinates(ZeroCounter));//converts the index of greatest traversal to the starting index(on the edge of the grid) of said traversal + //DisMatrixCreater(ArrayY,ArrayX);//standard double for loop to create the proper distance matrix- to check agaisnt by horizontal creation + + System.out.println("5. Exact X & Y grid coordinates of longest String of Zeros (x,y)_"+ "(" + LZScoordinates[getIndexOfLargest(LongestZeroString)][1] + "," + LZScoordinates[getIndexOfLargest(LongestZeroString)][0] + ")"); + System.out.println("6. Percentage Match relative to perfect match_(most total zeros)_" + PercentMatch(ZeroCounter,DataSetX) + "%"); + System.out.println("7. Percentage Match relative to perfect match_(longest continous string of zeros)_" + PercentMatch(LongestZeroString,DataSetX) + "%"); + + } + else + { + System.out.println("The Datasets being compared aren't the same length-"); + } + + } + +//public static void ReadGoogleSlides() + + public static void DisMatrixFillingHorizontally(int[]D1, int[]D2)//int XstartPos, int YstartPos or maybe like a counter of # of traversals + {//arguemnts are the vectors/arrays of data points, the X and Y starting position within the Matrix + //Xpos = columns Ypos= rows + + int x = D1.length+D2.length-1; //# of horizontal iterations needed to go through every potential unit of the grid. Its the number of edge points we use + //13 is the answer for a 7 by 7 grid. + //its equal to the number of edge values or length + width - 1 + int XstartPos = 0; + int YstartPos = 0; + int traversalsFromMiddle = 0; + int ZeroCounter = 0; //index of ZeroCounter array- it is the same as the traversalFromMiddle untill TFM is reset after half the grid is filled + //with evenly sized arrays the X value will always be Odd so we have to account for java rounding down by adding 1 + + for(int i=0; i<((x/2)+1); i++)//goes through as many times are you need to traverse the middle traversal to the bottom left corner. + { + traversalsFromMiddle = MatrixHelper(D1, D2, XstartPos, YstartPos, traversalsFromMiddle, ZeroCounter)+1; //if the Helper returns the traversal count we can use its value even if the method is creating seperate variables + ZeroCounter++; + XstartPos = traversalsFromMiddle; + } + + traversalsFromMiddle=1; + XstartPos = 0;//give new starting edge index values + YstartPos = 1; + + for(int p=0; p<(x/2); p++) //traversals from 1 above the middle to top right corner. + { + traversalsFromMiddle = MatrixHelper(D1, D2, XstartPos, YstartPos, traversalsFromMiddle, ZeroCounter)+1; + ZeroCounter++; + YstartPos = traversalsFromMiddle; + } + } + + + /*to know when to make the recursive call A know how many grid positions must be filled before hand and count or B know the final position index you will land at + + if the length/width of the DisMX grid are equal(THE GRID IS A SQUARE) the length of the longest horizontal(corner to corner) requires sidelength amount of sqaure traversals. + The longest traversal of a grid with sidelength of X is X operations. Starting at position (0,0) + starting position (0,y) or (y,0) (where sidelength>y>0) takes Sidelength-Y operations to reach its final grid position + + ex with #s: a 7 by 7 grid's longest traversal is 7 operations long(Starting at position 0,0) + (0,1)=6op (0,2) = 5...(0,6) = 1 same with (1,0)=6op, (2,0)=5op....(6,0)=1op + longest traversal(starting at 0,0) is maximum the length of the shorter side + + + + for a sqaure the longest traversal ends at (sidelength, sidelength) (the perfect diagonal) + starting position (y,0) ends at position (sidelength-1, sidelength-y-1) //moving towards bottom right corner of grid + starting position (0,x) ends at position (sidelength-x-1, sidelength-1) //moving towards top right corner of grid + //we do -1 because the side length starts at 1 but the indexing of the grid starts at 0. so we have to adjust + */ + + + + public static int MatrixHelper(int[]D1, int[]D2, int Xpos, int Ypos, int traversals, int ZerocIndex) + {//the matrix helper function does one complete horizontal traversal of the grid + + int SidelengthX = D1.length; //.length is the actually amount of positions allocated in memory so index 0-9 is length 10 + boolean test = true; + int ActiveZeroCounter = 0; + + while(test==true) + { + //System.out.println("(Y,X)" + Ypos + "," + Xpos); + //System.out.println("traversals = " + traversals); //doesn't work in secound half of the listing unless the traversal values are reset to 1 + + int DistanceValue = (D2[Ypos]-D1[Xpos])*(D2[Ypos]-D1[Xpos]); //the difference between the values squared + DistanceMatrix[Xpos][Ypos] = DistanceValue;//put value into Matrix + + if(DistanceValue == 0)//found match (maybe pick a larger range ala 2-6 to allow for puedomatch) + { + ZeroCounter[ZerocIndex] = ZeroCounter[ZerocIndex]+1;//increment index of total zeros in diagonal traversal + ActiveZeroCounter = ActiveZeroCounter+1; + //System.out.println("Active Zero Counter: " + ActiveZeroCounter); + } + else //found non zero value + { + if(ActiveZeroCounter > LongestZeroString[ZerocIndex])//if the String of zeros currently found is greater than what we have previously found for this traversal + { + LongestZeroString[ZerocIndex] = ActiveZeroCounter;//change the value of longest Active Zero counter to the active zero counter + //System.out.println("#: " + (Xpos - ActiveZeroCounter) + "," + (Ypos - ActiveZeroCounter)); + LZScoordinates[ZerocIndex][0] = (Xpos - ActiveZeroCounter +1); + LZScoordinates[ZerocIndex][1] = (Ypos - ActiveZeroCounter +1); + + } + //System.out.println(ActiveZeroCounter); + ActiveZeroCounter = 0; //reset the active counter to zero + } + + //leave loop if the array indexs being called match the final position the call reaches + if(Xpos == SidelengthX-1-traversals && Ypos == SidelengthX-1)//-1 is because array index starts at zero side length doesn't. + { + //System.out.println("time to break-TR"); + //System.out.println("number of zeros:" + ZeroCounter[ZerocIndex]); + if(ActiveZeroCounter > LongestZeroString[ZerocIndex])//Need to check in senario when the traversal ends in a match/zero + { + LongestZeroString[ZerocIndex] = ActiveZeroCounter; + // System.out.println("#: " + (Xpos - ActiveZeroCounter + 1) + "," + (Ypos - ActiveZeroCounter + 1)); + LZScoordinates[ZerocIndex][0] = (Xpos - ActiveZeroCounter + 1); + LZScoordinates[ZerocIndex][1] = (Ypos - ActiveZeroCounter + 1); + + } + + test = false; + break; + } + + if(Xpos== SidelengthX-1 && Ypos== SidelengthX-1-traversals)//the ending coordinates for values below the middle traversal towards to bottom left + { + //System.out.println("time to break2-BL"); + //System.out.println("number of zeros:" + ZeroCounter[ZerocIndex]); + if(ActiveZeroCounter > LongestZeroString[ZerocIndex])//Need to check in senario when the traversal ends in a match/zero + { + LongestZeroString[ZerocIndex] = ActiveZeroCounter;//change the value of longest Active Zero counter to the active zero counter + //System.out.println("#: " + (Xpos - ActiveZeroCounter + 1) + "," + (Ypos - ActiveZeroCounter + 1)); + LZScoordinates[ZerocIndex][0] = (Xpos - ActiveZeroCounter + 1); + LZScoordinates[ZerocIndex][1] = (Ypos - ActiveZeroCounter + 1); + } + + test = false; + break; + } + + Xpos = Xpos+1; + Ypos = Ypos+1; + } + + return traversals; //so we can keep track of the position of the traversal + //we need to know which travesal we have just done. To know where to start the next one + } + + + public static String GridStartingCoordinates(int[] array) //Gets starting coordinates of traversal based on largest value in array + { + int LargestValueIndex = getIndexOfLargest(array); + String coordinates; + + if(LargestValueIndex>((array.length/2))) + { + coordinates = new String(("(" + (LargestValueIndex-((array.length/2))) + "," + 0 +") "+ "value: " + array[LargestValueIndex])); + //if the starting value was in the top half of the grid + //this is because the grid is made middle to bottom left corner then back to the middle line to top right corner + } + else + { + coordinates = new String("(" + 0 + "," + LargestValueIndex + ") " + "value: " + array[LargestValueIndex]); + } + + return coordinates; + } + + public static double PercentMatch(int[]Array, int[]GridSideLength) + { //takes in two arrays. Finds the percent match between largest value in first array and max length in second array + int IndexOfLargest = getIndexOfLargest(Array); + double ValueTotal = GridSideLength.length; + double ValueMatchs = (double) Array[IndexOfLargest]; + double PercentMatch = (ValueMatchs/ValueTotal)*100; + return PercentMatch; + } + +} + + diff --git a/dedupe/dtw/Old versions MP/Info b/dedupe/dtw/Old versions MP/Info new file mode 100644 index 00000000..4148a4bb --- /dev/null +++ b/dedupe/dtw/Old versions MP/Info @@ -0,0 +1 @@ +This folder contains old versions of the Distance Matrix Code. diff --git a/dedupe/dtw/Old versions MP/SqaureGrid.java b/dedupe/dtw/Old versions MP/SqaureGrid.java new file mode 100644 index 00000000..3982c345 --- /dev/null +++ b/dedupe/dtw/Old versions MP/SqaureGrid.java @@ -0,0 +1,75 @@ +public class SqaureGrid +{ + + public static int getIndexOfLargest(int[] array) //returns index of largest value in an array + { + if(array == null || array.length == 0) + { + return -1; // null or empty + } + int largest = 0; + + for(int i = 1; iarray[largest]) + { + largest = i; + } + } + + return largest; + } + + public static void MatchArrays(int[]Array1, int[]Array2) + { + if(Array1.length>Array2.length) + { + int[] Temp = new int[Array1.length]; + for(int i = 0; iarray[largest]) + { + largest = i; + } + } + + return largest; + } + + + + + public static void MatchArrays(int[]Array1, int[]Array2) + { + if(Array1.length>Array2.length) + { + int[] Temp = new int[Array1.length]; + for(int i = 0; iarray[largest]) + { + largest = i; + } + } + + return largest; + } + + + + + public static void MatchArrays(int[]Array1, int[]Array2) + { + if(Array1.length>Array2.length) + { + int[] Temp = new int[Array1.length]; + for(int i = 0; iarray[largest]) + { + largest = i; + } + } + + return largest; + } + + + + + public static void MatchArrays(int[]Array1, int[]Array2) + { + if(Array1.length>Array2.length) + { + int[] Temp = new int[Array1.length]; + for(int i = 0; iTraversalsStartingonXAxis) //the Y axis is bigger than the X axis + { + shortersidelength = TraversalsStartingonXAxis+1; + LongerSideY = true; + } + else ////the X axis is bigger than the Y axis + { + shortersidelength = TraversalsStartingonYAxis+1; + LongerSideY = false; + } + + if(LongerSideY==true) //to make sure I get a positive value when taking the difference. + { + difference = TraversalsStartingonYAxis - TraversalsStartingonXAxis; + } + else + { + difference = TraversalsStartingonXAxis - TraversalsStartingonYAxis; + } + + //int CutOffPoint = (shortersidelength/2);//setting what value with result in a corner cut off + int CutOffPoint = (shortersidelength/4);//lower cut off, less traversals removed. + + int XstartPos = 0; + int YstartPos = 0; + int traversalsFromMiddle = 0; + int ZeroCounter = 0; //index of ZeroCounter array- it is the same as the traversalFromMiddle untill TFM is reset after half the grid is filled + //with evenly sized arrays the X value will always be Odd so we have to account for java rounding down by adding 1 + + for(int i=0; i<(TraversalsStartingonXAxis+1); i++)//goes through as many times are you need to traverse the middle traversal to the bottom left corner. + { + if(LongerSideY==false) //X is the longer side, so it must be accounted for in the assumption + { + if(CutOffPoint>=(shortersidelength+difference-i))//this code would cut off the Matrix from filling itself once the traversal length was less than half the maximum diagonal. + { + //System.out.println("Skipped"); + ZeroCounter = ZeroCounter +1; + continue; + } + } + else + { + if(CutOffPoint>=(shortersidelength-i))//this code would cut off the Matrix from filling itself once the traversal length was less than half the maximum diagonal. + { + //System.out.println("Skipped:::"); + ZeroCounter = ZeroCounter + 1; + continue; + } + } + + + traversalsFromMiddle = MatrixHelper(DY, DX, XstartPos, YstartPos, traversalsFromMiddle, ZeroCounter)+1; //if the Helper returns the traversal count we can use its value even if the method is creating seperate variables + ZeroCounter++; + XstartPos = traversalsFromMiddle; + } + + traversalsFromMiddle=1; + XstartPos = 0;//give new starting edge index values + YstartPos = 1; + +//System.out.println(LongerSideY); + + for(int p=0; p=(shortersidelength-p-1))//this code would cut off the Matrix from filling itself once the traversal length was less than half the maximum diagonal. + {//+-1 because the values start at (0,1) instead of (0,0) in the for loop above. So there are fewer loops to reach the threshhold + //System.out.println("Skipped->"); + ZeroCounter = ZeroCounter + 1; + continue; + } + } + else //Y is the longer side so we have to give it more buffer space + { + if(CutOffPoint>=(shortersidelength+difference-p-1))//this code would cut off the Matrix from filling itself once the traversal length was less than half the maximum diagonal. + { + //System.out.println("Skipped----"); + ZeroCounter = ZeroCounter + 1; + continue; + } + } + + + traversalsFromMiddle = MatrixHelper(DY, DX, XstartPos, YstartPos, traversalsFromMiddle, ZeroCounter)+1; + ZeroCounter++; + YstartPos = traversalsFromMiddle; + } + } + + + /*to know when to make the recursive call A know how many grid positions must be filled before hand and count or B know the final position index you will land at + + if the length/width of the DisMX grid are equal(THE GRID IS A SQUARE) the length of the longest horizontal(corner to corner) requires sidelength amount of sqaure traversals. + The longest traversal of a grid with sidelength of X is X operations. Starting at position (0,0) + starting position (0,y) or (y,0) (where sidelength>y>0) takes Sidelength-Y operations to reach its final grid position + + ex with #s: a 7 by 7 grid's longest traversal is 7 operations long(Starting at position 0,0) + (0,1)=6op (0,2) = 5...(0,6) = 1 same with (1,0)=6op, (2,0)=5op....(6,0)=1op + longest traversal(starting at 0,0) is maximum the length of the shorter side + + + + for a sqaure the longest traversal ends at (sidelength, sidelength) (the perfect diagonal) + starting position (y,0) ends at position (sidelength-1, sidelength-y-1) //moving towards bottom right corner of grid + starting position (0,x) ends at position (sidelength-x-1, sidelength-1) //moving towards top right corner of grid + //we do -1 because the side length starts at 1 but the indexing of the grid starts at 0. so we have to adjust + */ + + + + public static int MatrixHelper(int[]DY, int[]DX, int Xpos, int Ypos, int traversals, int ZerocIndex) + {//the matrix helper function does one complete diagonal traversal of the grid + //It compares values from DY/DX arrays + //int XstartPos, int YstartPos are starting positions for the traversal + //traversals is a arguement that keeps the matrix aware of how far it is from the center diagonal + //ZeroxIndex is the index of the zero counter static array(for data collection) It matches the traversals counter until traversals is reset before the 2nd for loop in DisMatrixFillingDiagonally + + int SidelengthX = DX.length; //.length is the actually amount of positions allocated in memory so index 0-9 is length 10 + int SidelengthY = DY.length; + boolean test = true; + int ActiveZeroCounter = 0; + + while(test==true) + { + int DistanceValue = (DX[Xpos]-DY[Ypos])*(DX[Xpos]-DY[Ypos]); //the difference between the values squared + DistanceMatrix[Ypos][Xpos] = DistanceValue;//put value into Matrix + + if(DistanceValue == 0)//found match (maybe pick a larger range ala 2-6 to allow for puedomatch) + { + ZeroCounter[ZerocIndex] = ZeroCounter[ZerocIndex]+1;//increment index of total zeros in diagonal traversal + ActiveZeroCounter = ActiveZeroCounter+1; + } + else //found non zero value + { + if(ActiveZeroCounter > LongestZeroString[ZerocIndex])//if the String of zeros currently found is greater than what we have previously found for this traversal + { + LongestZeroString[ZerocIndex] = ActiveZeroCounter;//change the value of longest Active Zero counter to the active zero counter + LZScoordinates[ZerocIndex][0] = (Xpos - ActiveZeroCounter); + LZScoordinates[ZerocIndex][1] = (Ypos - ActiveZeroCounter); + + } + ActiveZeroCounter = 0; //reset the active counter to zero + } + + //leave loop if the array indexs being called match the final position the call reaches + if(Xpos >= (SidelengthX-1) || Ypos >= (SidelengthY-1))//-1 is because array index starts at zero but side length doesn't. + { + if(ActiveZeroCounter > LongestZeroString[ZerocIndex])//Need to check in senario when the traversal ends in a match/zero + { + LongestZeroString[ZerocIndex] = ActiveZeroCounter; + System.out.println(ActiveZeroCounter); + if(ActiveZeroCounter>0) + LZScoordinates[ZerocIndex][0] = ((Xpos+1) - ActiveZeroCounter); + LZScoordinates[ZerocIndex][1] = ((Ypos+1) - ActiveZeroCounter); + //the +1 is needed because the ActiveZeroCounter could potentially give you a negative 'coordinate', + //If the zero string makes it all the way to the bottom, the (X/Ypos-activeZeroCounter) < 0. Not possible + //It isn't needed for the strings that don't end on the final position on the traversal because the +1s are happening within the while loop. + //IE. These coordinates are being calculated during the same loop as the final zero is found, but in the case above the coordinates are being calculated in the loop after the final zero is found. + } + + test = false; + } + + Xpos = Xpos+1; + Ypos = Ypos+1; + } + + return traversals; //so we can keep track of the position of the traversal + //we need to know which travesal we have just done. To know where to start the next one + } + + + public static String GridEdgeCoordinates(int[] array, int[][]Matrix) //Gets starting coordinates of traversal based on largest value in array + { + int LargestValueIndex = getIndexOfLargest(array); + String coordinates; + + + if(LargestValueIndex>=(Matrix[1].length)) + { + coordinates = new String("(" + 0 + "," + ((LargestValueIndex-(Matrix[1].length)+1) +") "+ "value: " + array[LargestValueIndex])); + //if the starting value was in the top half of the grid + //this is because the grid is made middle to bottom left corner then back to the middle line to top right corner + } + else + { + coordinates = new String("(" + LargestValueIndex + "," + 0 + ") " + "value: " + array[LargestValueIndex]); + } + + return coordinates; + } + + public static double PercentMatch(int[]Array, int[][]Matrix) + { //takes in one array, and a matrix. Finds the percent match between largest value in array and shorter sidelength of the matrix. + //We are determining how many matchs occured(or consequetively or not), in the best case, vs how many could have + int IndexOfLargest = getIndexOfLargest(Array); + double ValueTotal; + + //no given traversal is longer than the shorter side of the Matrix-thus the longest traversal is = the shorter sidelength + if(Matrix.length > Matrix[1].length) + { + ValueTotal = Matrix[1].length; + } + else + { + ValueTotal = Matrix.length; + } + + double ValueMatchs = (double) Array[IndexOfLargest]; + double PercentMatch = (ValueMatchs/ValueTotal)*100; + return PercentMatch; + } + +} + + diff --git a/dedupe/dtw/Old versions MP/rectangleMatrixCutOff.class b/dedupe/dtw/Old versions MP/rectangleMatrixCutOff.class new file mode 100644 index 00000000..4d3561fd Binary files /dev/null and b/dedupe/dtw/Old versions MP/rectangleMatrixCutOff.class differ diff --git a/dedupe/dtw/Old versions MP/rectangleMatrixCutOff.java b/dedupe/dtw/Old versions MP/rectangleMatrixCutOff.java new file mode 100644 index 00000000..407c3bc8 --- /dev/null +++ b/dedupe/dtw/Old versions MP/rectangleMatrixCutOff.java @@ -0,0 +1,371 @@ +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.util.Scanner; + +public class DistanceMatrixCutOffAssumption +{ + static int[] ZeroCounter; //static members of the class which can be accessed anywhere + static int[][] DistanceMatrix; + static int[] LongestZeroString; + static int[][] LZScoordinates; + static int[] DataSetX; + static int[] DataSetY; + + + + public static void main(String[] args) throws FileNotFoundException + { + +//QUICK TEST CASES + // int[] ArrayY = new int[]{1,1,2,2,1,1};//set of values that should have different LongestZeroString and ZeroCounter + // int[] ArrayX = new int[]{1,2,2,2,0,1,1,1}; + + // int[] ArrayX = new int[]{1,1,2,3,2,1,0,1};//set of values that should have different LongestZeroString and ZeroCounter + // int[] ArrayY = new int[]{0,1,1,2,3,2,1,1,3,1,2}; + + //{0,0,0,1,1};// + //{0,0,1,1,0};// + + + int[] ArrayY = new int[]{1,2,1,1,2,4,4,3}; + int[] ArrayX = new int[]{1,1,1,2,4,4,3,2,2,3,4,5}; + + // int[] ArrayX = new int[]{2,2,3,3,2,2,2,2,2}; //set of values that should have different LongestZeroString and ZeroCounter + // int[] ArrayY = new int[]{2,2,2,3,2,2}; + + // int[] ArrayX = new int[]{3,2,3,1}; //GOOD example of finding starting coordinates of starting positions + // int[] ArrayY = new int[]{2,3,1,3,1,3,4,1}; + // int[] ArrayX = new int[]{80,82,85,85,94,90,90,75,77,80,81}; + // int[] ArrayY = new int[]{75,77,80,82,85,85,85,89,94,90,90}; + + + FileInputStream data = new FileInputStream("/Users/samuelrapp/Desktop/DM/datasetTest.txt"); //X + FileInputStream data2 = new FileInputStream("/Users/samuelrapp/Desktop/DM/dataset2Test.txt"); //Y + Scanner scanner = new Scanner(data); + Scanner scanner2 = new Scanner(data2); + + DataSetX = new int[11044];//I looked at the number of lines in the file + DataSetY = new int[11044]; + DataSetX = ArrayX; + DataSetY = ArrayY; + + // int i = 0; + // while(scanner.hasNextLine()) + // { + + // DataSetX[i] = scanner.nextInt(); + // DataSetY[i] = scanner2.nextInt(); + + // i++; + // } + // scanner.close(); + // scanner2.close(); + + + + int HorizontalTraversals = DataSetX.length+DataSetY.length-1; + ZeroCounter = new int[HorizontalTraversals]; //count total zero/matchs in each diagonal traversal + LongestZeroString = new int[HorizontalTraversals]; //tallys the longest continious string of zeros found in any given traversal + LZScoordinates = new int[HorizontalTraversals][2];//records coordinates(in 2 column matrix) 0 index is X pos, 1 index is Y pos (seems reversed for some reason) + + + + DistanceMatrix = new int[DataSetY.length][DataSetX.length]; //row then columns (incrementing the X values chances the column) + PrintMatrix(DistanceMatrix); + DisMatrixFillingDiagonally(DataSetY,DataSetX); + // PrintMatrix(DistanceMatrix); //just a basic print function + // PrintMatrix(LZScoordinates); + System.out.println("RESULT SECTION:"); + System.out.println("1. Array index of most zeros: " + getIndexOfLargest(ZeroCounter)); //index of most exact overlap(ala most matching values/zeros) + System.out.println("2. Array index of longest string of zeros: " + getIndexOfLargest(LongestZeroString)); + System.out.println("3. Starting coordinates of the traversal(on the grid edge) with the longest continuous string of zeros: (x,y)_" + GridEdgeCoordinates(LongestZeroString, DistanceMatrix)); //index of most exact overlap(ala most matching values/zeros) + System.out.println("4. Starting coordinates of the traversal(on the grid edge) with the most total zeros: (x,y)_" + GridEdgeCoordinates(ZeroCounter, DistanceMatrix));//converts the index of greatest traversal to the starting index(on the edge of the grid) of said traversal + DisMatrixHorizontal(ArrayY,ArrayX);//standard double for loop to create the proper distance matrix- to check agaisnt by horizontal creation + PrintMatrix(DistanceMatrix); + + System.out.println("5. Exact X & Y grid coordinates of longest String of Zeros (x,y)_"+ "(" + LZScoordinates[getIndexOfLargest(LongestZeroString)][0] + "," + LZScoordinates[getIndexOfLargest(LongestZeroString)][1] + ")"); + System.out.println("6. Percentage Match relative to perfect match_(most total zeros)_" + PercentMatch(ZeroCounter,DistanceMatrix) + "%"); + System.out.println("7. Percentage Match relative to perfect match_(longest continous string of zeros)_" + PercentMatch(LongestZeroString,DistanceMatrix) + "%"); + + } + +//public static void ReadGoogleSlides() + + public static void DisMatrixFillingDiagonally(int[]DY, int[]DX) + {//arguemnts are the arrays of data points + //Xpos = columns Ypos= rows + int TotalTraversals = DY.length+DX.length-1; //# of diagonal iterations needed to go through every potential unit of the grid. Its the number of edge points we use + //13 is the answer for a 7 by 7 grid. + //its equal to the number of edge values or length + width - 1 + int TraversalsStartingonYAxis = TotalTraversals-DX.length; + int TraversalsStartingonXAxis = TotalTraversals-DY.length; + + int shortersidelength; //determining the longest possible diagonal in the given matrix, ala the length of the shorter data set + boolean LongerSideY; + int difference; + + if(TraversalsStartingonYAxis>TraversalsStartingonXAxis) //the Y axis is bigger than the X axis + { + shortersidelength = TraversalsStartingonXAxis+1; + LongerSideY = true; + } + else ////the X axis is bigger than the Y axis + { + shortersidelength = TraversalsStartingonYAxis+1; + LongerSideY = false; + } + + if(LongerSideY==true) //to make sure I get a positive value when taking the difference. + { + difference = TraversalsStartingonYAxis - TraversalsStartingonXAxis; + } + else + { + difference = TraversalsStartingonXAxis - TraversalsStartingonYAxis; + } + + //int CutOffPoint = (shortersidelength/2);//setting what value with result in a corner cut off + int CutOffPoint = (shortersidelength/100);//lower cut off, less traversals removed. + + int XstartPos = 0; + int YstartPos = 0; + int traversalsFromMiddle = 0; + int ZeroCounter = 0; //index of ZeroCounter array- it is the same as the traversalFromMiddle untill TFM is reset after half the grid is filled + //with evenly sized arrays the X value will always be Odd so we have to account for java rounding down by adding 1 + + for(int i=0; i<(TraversalsStartingonXAxis+1); i++)//goes through as many times are you need to traverse the middle traversal to the bottom left corner. + { + if(LongerSideY==false) //X is the longer side, so it must be accounted for in the assumption + { + if(CutOffPoint>=(shortersidelength+difference-i))//this code would cut off the Matrix from filling itself once the traversal length was less than half the maximum diagonal. + { + //System.out.println("Skipped"); + ZeroCounter = ZeroCounter +1; + continue; + } + } + else + { + if(CutOffPoint>=(shortersidelength-i))//this code would cut off the Matrix from filling itself once the traversal length was less than half the maximum diagonal. + { + //System.out.println("Skipped:::"); + ZeroCounter = ZeroCounter + 1; + continue; + } + } + + + traversalsFromMiddle = MatrixHelper(DY, DX, XstartPos, YstartPos, traversalsFromMiddle, ZeroCounter)+1; //if the Helper returns the traversal count we can use its value even if the method is creating seperate variables + ZeroCounter++; + XstartPos = traversalsFromMiddle; + } + + traversalsFromMiddle=1; + XstartPos = 0;//give new starting edge index values + YstartPos = 1; + +//System.out.println(LongerSideY); + + for(int p=0; p=(shortersidelength-p-1))//this code would cut off the Matrix from filling itself once the traversal length was less than half the maximum diagonal. + {//+-1 because the values start at (0,1) instead of (0,0) in the for loop above. So there are fewer loops to reach the threshhold + //System.out.println("Skipped->"); + ZeroCounter = ZeroCounter + 1; + continue; + } + } + else //Y is the longer side so we have to give it more buffer space + { + if(CutOffPoint>=(shortersidelength+difference-p-1))//this code would cut off the Matrix from filling itself once the traversal length was less than half the maximum diagonal. + { + //System.out.println("Skipped----"); + ZeroCounter = ZeroCounter + 1; + continue; + } + } + + + traversalsFromMiddle = MatrixHelper(DY, DX, XstartPos, YstartPos, traversalsFromMiddle, ZeroCounter)+1; + ZeroCounter++; + YstartPos = traversalsFromMiddle; + } + } + + + /*to know when to make the recursive call A know how many grid positions must be filled before hand and count or B know the final position index you will land at + + if the length/width of the DisMX grid are equal(THE GRID IS A SQUARE) the length of the longest horizontal(corner to corner) requires sidelength amount of sqaure traversals. + The longest traversal of a grid with sidelength of X is X operations. Starting at position (0,0) + starting position (0,y) or (y,0) (where sidelength>y>0) takes Sidelength-Y operations to reach its final grid position + + ex with #s: a 7 by 7 grid's longest traversal is 7 operations long(Starting at position 0,0) + (0,1)=6op (0,2) = 5...(0,6) = 1 same with (1,0)=6op, (2,0)=5op....(6,0)=1op + longest traversal(starting at 0,0) is maximum the length of the shorter side + + + + for a sqaure the longest traversal ends at (sidelength, sidelength) (the perfect diagonal) + starting position (y,0) ends at position (sidelength-1, sidelength-y-1) //moving towards bottom right corner of grid + starting position (0,x) ends at position (sidelength-x-1, sidelength-1) //moving towards top right corner of grid + //we do -1 because the side length starts at 1 but the indexing of the grid starts at 0. so we have to adjust + */ + + + + public static int MatrixHelper(int[]DY, int[]DX, int Xpos, int Ypos, int traversals, int ZerocIndex) + {//the matrix helper function does one complete diagonal traversal of the grid + //It compares values from DY/DX arrays + //int XstartPos, int YstartPos are starting positions for the traversal + //traversals is a arguement that keeps the matrix aware of how far it is from the center diagonal + //ZeroxIndex is the index of the zero counter static array(for data collection) It matches the traversals counter until traversals is reset before the 2nd for loop in DisMatrixFillingDiagonally + + int SidelengthX = DX.length; //.length is the actually amount of positions allocated in memory so index 0-9 is length 10 + int SidelengthY = DY.length; + boolean test = true; + int ActiveZeroCounter = 0; + + while(test==true) + { + int DistanceValue = (DX[Xpos]-DY[Ypos])*(DX[Xpos]-DY[Ypos]); //the difference between the values squared + DistanceMatrix[Ypos][Xpos] = DistanceValue;//put value into Matrix + + if(DistanceValue == 0)//found match (maybe pick a larger range ala 2-6 to allow for puedomatch) + { + ZeroCounter[ZerocIndex] = ZeroCounter[ZerocIndex]+1;//increment index of total zeros in diagonal traversal + ActiveZeroCounter = ActiveZeroCounter+1; + } + else //found non zero value + { + if(ActiveZeroCounter > LongestZeroString[ZerocIndex])//if the String of zeros currently found is greater than what we have previously found for this traversal + { + LongestZeroString[ZerocIndex] = ActiveZeroCounter;//change the value of longest Active Zero counter to the active zero counter + LZScoordinates[ZerocIndex][0] = (Xpos - ActiveZeroCounter); + LZScoordinates[ZerocIndex][1] = (Ypos - ActiveZeroCounter); + + } + ActiveZeroCounter = 0; //reset the active counter to zero + } + + //leave loop if the array indexs being called match the final position the call reaches + if(Xpos >= (SidelengthX-1) || Ypos >= (SidelengthY-1))//-1 is because array index starts at zero but side length doesn't. + { + if(ActiveZeroCounter > LongestZeroString[ZerocIndex])//Need to check in senario when the traversal ends in a match/zero + { + LongestZeroString[ZerocIndex] = ActiveZeroCounter; + System.out.println(ActiveZeroCounter); + if(ActiveZeroCounter>0) + LZScoordinates[ZerocIndex][0] = ((Xpos+1) - ActiveZeroCounter); + LZScoordinates[ZerocIndex][1] = ((Ypos+1) - ActiveZeroCounter); + //the +1 is needed because the ActiveZeroCounter could potentially give you a negative 'coordinate', + //If the zero string makes it all the way to the bottom, the (X/Ypos-activeZeroCounter) < 0. Not possible + //It isn't needed for the strings that don't end on the final position on the traversal because the +1s are happening within the while loop. + //IE. These coordinates are being calculated during the same loop as the final zero is found, but in the case above the coordinates are being calculated in the loop after the final zero is found. + } + + test = false; + } + + Xpos = Xpos+1; + Ypos = Ypos+1; + } + + return traversals; //so we can keep track of the position of the traversal + //we need to know which travesal we have just done. To know where to start the next one + } + + + public static String GridEdgeCoordinates(int[] array, int[][]Matrix) //Gets starting coordinates of traversal based on largest value in array + { + int LargestValueIndex = getIndexOfLargest(array); + String coordinates; + + + if(LargestValueIndex>=(Matrix[1].length)) + { + coordinates = new String("(" + 0 + "," + ((LargestValueIndex-(Matrix[1].length)+1) +") "+ "value: " + array[LargestValueIndex])); + //if the starting value was in the top half of the grid + //this is because the grid is made middle to bottom left corner then back to the middle line to top right corner + } + else + { + coordinates = new String("(" + LargestValueIndex + "," + 0 + ") " + "value: " + array[LargestValueIndex]); + } + + return coordinates; + } + + public static int getIndexOfLargest(int[] array) //returns index of largest value in an array + { + if(array == null || array.length == 0) + { + return -1; // null or empty + } + + int largest = 0; + + for(int i = 1; iarray[largest]) + { + largest = i; + } + } + + return largest; + } + + public static double PercentMatch(int[]Array, int[][]Matrix) + { //takes in one array, and a matrix. Finds the percent match between largest value in array and shorter sidelength of the matrix. + //We are determining how many matchs occured(or consequetively or not), in the best case, vs how many could have + int IndexOfLargest = getIndexOfLargest(Array); + double ValueTotal; + + //no given traversal is longer than the shorter side of the Matrix-thus the longest traversal is = the shorter sidelength + if(Matrix.length > Matrix[1].length) + { + ValueTotal = Matrix[1].length; + } + else + { + ValueTotal = Matrix.length; + } + + double ValueMatchs = (double) Array[IndexOfLargest]; + double PercentMatch = (ValueMatchs/ValueTotal)*100; + return PercentMatch; + } + + public static void PrintMatrix(int[][]Matrix) //prints the Distance Matrix given a Matrix + {//Print out the matrix in nice + for (int i = 0; i < Matrix.length; i++) + { + for (int j = 0; j < Matrix[i].length; j++) + { + System.out.print(Matrix[i][j] + " | "); + } + System.out.println(); + } + System.out.print("_____________________________________"); + System.out.println(); + } + + public static void DisMatrixHorizontal(int[]X,int[]Y) //standard approach to creating the distance matrix via 2 for loop. horizontal traversal not diagonal + { + int[][] DistanceMX = new int[X.length][Y.length]; + for(int i=0; i<(X.length); i++) + { + for(int t=0; ty>0) takes Sidelength-Y operations to reach its final grid position + + ex with #s: a 7 by 7 grid's longest traversal is 7 operations long(Starting at position 0,0) + (0,1)=6op (0,2) = 5...(0,6) = 1 same with (1,0)=6op, (2,0)=5op....(6,0)=1op + longest traversal(starting at 0,0) is maximum the length of the shorter side + + + + for a sqaure the longest traversal ends at (sidelength, sidelength) (the perfect diagonal) + starting position (y,0) ends at position (sidelength-1, sidelength-y-1) //moving towards bottom right corner of grid + starting position (0,x) ends at position (sidelength-x-1, sidelength-1) //moving towards top right corner of grid + //we do -1 because the side length starts at 1 but the indexing of the grid starts at 0. so we have to adjust + */ + + + + public static int MatrixHelper(int[]DY, int[]DX, int Xpos, int Ypos, int traversals, int ZerocIndex) + {//the matrix helper function does one complete diagonal traversal of the grid + //It compares values from DY/DX arrays + //int XstartPos, int YstartPos are starting positions for the traversal + //traversals is a arguement that keeps the matrix aware of how far it is from the center diagonal + //ZeroxIndex is the index of the zero counter static array(for data collection) It matches the traversals counter until traversals is reset before the 2nd for loop in DisMatrixFillingHorizontally + + int SidelengthX = DX.length; //.length is the actually amount of positions allocated in memory so index 0-9 is length 10 + int SidelengthY = DY.length; + boolean test = true; + int ActiveZeroCounter = 0; + + while(test==true) + { + int DistanceValue = (DX[Xpos]-DY[Ypos])*(DX[Xpos]-DY[Ypos]); //the difference between the values squared + DistanceMatrix[Ypos][Xpos] = DistanceValue;//put value into Matrix + + if(DistanceValue == 0)//found match (maybe pick a larger range ala 2-6 to allow for puedomatch) + { + ZeroCounter[ZerocIndex] = ZeroCounter[ZerocIndex]+1;//increment index of total zeros in diagonal traversal + ActiveZeroCounter = ActiveZeroCounter+1; + } + // + else //found non zero value + { + if(ActiveZeroCounter > LongestZeroString[ZerocIndex])//if the String of zeros currently found is greater than what we have previously found for this traversal + { + LongestZeroString[ZerocIndex] = ActiveZeroCounter;//change the value of longest Active Zero counter to the active zero counter + LZScoordinates[ZerocIndex][0] = (Xpos - ActiveZeroCounter); + LZScoordinates[ZerocIndex][1] = (Ypos - ActiveZeroCounter); + + } + ActiveZeroCounter = 0; //reset the active counter to zero + } + + //leave loop if the array indexs being called match the final position the call reaches + if(Xpos >= (SidelengthX-1) || Ypos >= (SidelengthY-1))//-1 is because array index starts at zero but side length doesn't. + { + if(ActiveZeroCounter > LongestZeroString[ZerocIndex])//Need to check in senario when the traversal ends in a match/zero + { + LongestZeroString[ZerocIndex] = ActiveZeroCounter; + System.out.println(ActiveZeroCounter); + if(ActiveZeroCounter>0) + LZScoordinates[ZerocIndex][0] = ((Xpos+1) - ActiveZeroCounter); + LZScoordinates[ZerocIndex][1] = ((Ypos+1) - ActiveZeroCounter); + //the +1 is needed because the ActiveZeroCounter could potentially give you a negative 'coordinate', + //If the zero string makes it all the way to the bottom, the (X/Ypos-activeZeroCounter) < 0. Not possible + //It isn't needed for the strings that don't end on the final position on the traversal because the +1s are happening within the while loop. + //IE. These coordinates are being calculated during the same loop as the final zero is found, but in the case above the coordinates are being calculated in the loop after the final zero is found. + } + + test = false; + } + + Xpos = Xpos+1; + Ypos = Ypos+1; + } + + return traversals; //so we can keep track of the position of the traversal + //we need to know which travesal we have just done. To know where to start the next one + } + + + public static String GridEdgeCoordinates(int[] array, int[][]Matrix) //Gets starting coordinates of traversal based on largest value in array + { + int LargestValueIndex = getIndexOfLargest(array); + String coordinates; + + + if(LargestValueIndex>=(Matrix[1].length)) + { + coordinates = new String("(" + 0 + "," + ((LargestValueIndex-(Matrix[1].length)+1) +") "+ "value: " + array[LargestValueIndex])); + //if the starting value was in the top half of the grid + //this is because the grid is made middle to bottom left corner then back to the middle line to top right corner + } + else + { + coordinates = new String("(" + LargestValueIndex + "," + 0 + ") " + "value: " + array[LargestValueIndex]); + } + + return coordinates; + } + + public static double PercentMatch(int[]Array, int[][]Matrix) + { //takes in one array, and a matrix. Finds the percent match between largest value in array and shorter sidelength of the matrix. + //We are determining how many matchs occured(or consequetively or not), in the best case, vs how many could have + int IndexOfLargest = getIndexOfLargest(Array); + double ValueTotal; + + //no given traversal is longer than the shorter side of the Matrix-thus the longest traversal is = the shorter sidelength + if(Matrix.length > Matrix[1].length) + { + ValueTotal = Matrix[1].length; + } + else + { + ValueTotal = Matrix.length; + } + + double ValueMatchs = (double) Array[IndexOfLargest]; + double PercentMatch = (ValueMatchs/ValueTotal)*100; + return PercentMatch; + } + +} + + diff --git a/dedupe/mass/FindNN.cpp b/dedupe/mass/FindNN.cpp new file mode 100644 index 00000000..d2bd5992 --- /dev/null +++ b/dedupe/mass/FindNN.cpp @@ -0,0 +1,205 @@ +#include +#include +#include +#include + + + +double * multiply(double * x, int n , double * y , int m , double * z); +double * zNorm(double * x, int n, double * y); +double * findNN(double * x, double * y, int n, int m, double * dist); + + + +int main(int argc, char* argv[]) +{ + //Assume n > m + + int n = atol(argv[3]); + int m = atol(argv[4]); + double *x, *y, *dist; + + //Memory Allocation + + FILE * fp ; errno_t err = fopen_s(&fp,argv[1],"r"); + if( err ) + printf_s( "The file fscanf.out was not opened\n" ); + FILE * fp1 ; err = fopen_s(&fp1,argv[2],"r"); + if( err ) + printf_s( "The file fscanf.out was not opened\n"); + + x = (double *)malloc(sizeof(double) * n); + y = (double *)malloc(sizeof(double) * m); + dist = (double *)malloc(sizeof(double) * n); + + //Data Input + for ( int i = 0 ; i < n ; i ++ ) + { + double d; + fscanf_s(fp,"%lf",&d); + x[i] = d; + if( i < m ) + { + fscanf_s(fp1,"%lf",&d); + y[i] = d; + } + } + + dist = findNN(x,y,n,m,dist); + + double minm = 99999999999999.000222; + int mini = 0; + for ( int i = 0 ; i < n-m+1 ; i++ ) + if( dist[i] < minm ) + { minm = dist[i]; mini = i; } + + printf("Nearest Neighbor Distance is %lf\nNearest Neighbor location is %d (starting at 1)\n",minm, mini); + fclose(fp); fclose(fp1); + + + + + free(x); free(y); free(dist); + + system("PAUSE"); +} + + +double * findNN(double * x, double * y, int n, int m, double * dist) +{ + + //Assume n > m + double *z ; + double *cx, *cx2, *cy, *cy2; + + //Allocation + cx = (double *)malloc(sizeof(double) * (n+1)); + cx2 = (double *)malloc(sizeof(double) * (n+1)); + cy = (double *)malloc(sizeof(double) * (m+1)); + cy2 = (double *)malloc(sizeof(double) * (m+1)); + + //Normalize + x = zNorm(x,n,x); + y = zNorm(y,m,y); + + //Compute the cumulative sums + cx[0] = cx2[0] = cy[0] = cy2[0] = 0.0; + for( int i = 1 ; i <= n; i++ ) + { + cx[i] = cx[i-1]+x[i-1]; + cx2[i] = cx2[i-1]+x[i-1]*x[i-1]; + if( i <= m ) + { + cy[i] = cy[i-1]+y[i-1]; + cy2[i] = cy2[i-1]+y[i-1]*y[i-1]; + + } + + } + + //Compute the multiplication numbers + z = (double *)malloc(sizeof(double)*2*n); + z = multiply(x,n,y,m,z); + + //y Stats + + double sumy = cy[m]; + double sumy2 = cy2[m]; + double meany = sumy/m; + double sigmay = (sumy2/m)-meany*meany; + sigmay = sqrt(sigmay); + + + //The Search + for( int j = 0 ; j < n-m+1 ; j=j+1 ) + { + double sumxy = z[m-1+j]; + + double sumx = cx[j+m]-cx[j]; + double sumx2 = cx2[j+m]-cx2[j]; + double meanx = sumx/m; + double sigmax = (sumx2/m)-meanx*meanx; + sigmax = sqrt(sigmax); + + double c = ( sumxy - m*meanx*meany ) / ( m*sigmax*sigmay ); + dist[j] = sqrt(2*m*(1-c)); + + } + + free(cx); free(cx2); free(cy); free(cy2); + free(z); + return dist; +} + + +double * zNorm(double * x, int n, double * y) +{ + double ex = 0, ex2 = 0; + for(int i = 0 ; i < n ; i++ ) + { + ex += x[i]; + ex2 += x[i]*x[i]; + } + double mean = ex/n; + double std = ex2/n; + std = sqrt(std-mean*mean); + for(int i = 0 ; i < n ; i++ ) + y[i] = (x[i]-mean)/std; + return y; +} + +double * multiply(double * x, int n , double * y , int m , double * z) +{ + fftw_complex * X, * Y, * Z , *XX, *YY, *ZZ; + fftw_plan p; + + //assuming n > m + X = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * 2 * n); + Y = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * 2 * n); + XX = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * 2 * n); + YY = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * 2 * n); + Z = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * 2 * n); + ZZ = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * 2 * n); + + + for(int i = 0 ; i < 2*n ; i++ ) + { + X[i][1] = 0; Y[i][1] = 0; //iaginary part is always zero + if(i < n ) + X[i][0] = x[i]; + else + X[i][0] = 0; + + if(i < m ) + Y[i][0] = y[m-i-1]; //reversing y + else + Y[i][0] = 0; + } + + + p = fftw_plan_dft_1d(2 * n, X, XX, FFTW_FORWARD, FFTW_ESTIMATE); + fftw_execute(p); + + p = fftw_plan_dft_1d(2 * n, Y, YY, FFTW_FORWARD, FFTW_ESTIMATE); + fftw_execute(p); + + for(int i = 0 ; i < 2*n; i++) + { + ZZ[i][0] = XX[i][0]*YY[i][0] - XX[i][1]*YY[i][1]; + ZZ[i][1] = XX[i][1]*YY[i][0] + XX[i][0]*YY[i][1]; + } + + p = fftw_plan_dft_1d(2 * n, ZZ , Z , FFTW_BACKWARD, FFTW_ESTIMATE); + fftw_execute(p); + + + for(int i = 0; i < 2*n; i++ ) + z[i] = Z[i][0]/(2*n); + + fftw_destroy_plan(p); + fftw_free(X); fftw_free(Y); + fftw_free(XX); fftw_free(YY); + fftw_free(Z); fftw_free(ZZ); + + return z; +} \ No newline at end of file diff --git a/dedupe/mass/Mass.py b/dedupe/mass/Mass.py new file mode 100644 index 00000000..cda715a9 --- /dev/null +++ b/dedupe/mass/Mass.py @@ -0,0 +1,49 @@ +import numpy as np +from scipy import stats +import pandas as pd +import math + +def movstd(a,window): + left= window[0] + right = window[1] + result = [] + for i in range(len(a)): + r=0 + if i >= left and i +right= len(a): + r= np.std(a[i - left:],ddof=1) + else: + r=np.std(a[:],ddof=1) + result.append(0 if math.isnan(r) else r) + return result + + +def findInT(query,target): + m = len(query); + n = len(target); + Q = stats.zscore(query,ddof=1)#zNorm ??? + stdv = movstd(target,(0,m-1)) + Q= np.append(Q[::-1],np.zeros(n-m)) + dots =np.convolve(target,Q) + dist =2 * (m - (dots[m-1:n])/ stdv[0:n - m +1]) + return np.sqrt(dist) + +qry = np.array([1, 10, 5]); +tgt = [4,8,6,-1,-2,-3,-1,3,4,5]; +output=findInT(qry,tgt) +print output +#disp(Stdv) 2.0000 2.0000 1.0000 18.7705 21.0000 4.2426 0 +#Q = Q(end:-1:1); % Reverse +#the query +#disp(Q); +#Q(m + 1:n) = 0; % pad +#zeros +#disp(Q); +#dots = conv(T, Q); +#disp(X(m:n)) +#dist = 2 * (m - (dots(m:n))./ Stdv(1:n - m + 1)); +#dist = sqrt(dist); +