/**
 * Copyright (c) 2011, The University of Southampton and the individual contributors.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *
 *   * Redistributions in binary form must reproduce the above copyright notice,
 *     this list of conditions and the following disclaimer in the documentation
 *     and/or other materials provided with the distribution.
 *
 *   * Neither the name of the University of Southampton nor the names of its
 *     contributors may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
package org.openimaj.ml.clustering.kdtree;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.log4j.Logger;
import org.openimaj.ml.clustering.kdtree.ClusterTestDataLoader.TestStats;

/**
 * Load clusters from http://people.cs.nctu.edu.tw/~rsliang/dbscan/testdatagen.html
 * <p>
 * The input is an array of lines made of three sections separated by an empty
 * line:
 * <ol>
 * <li>five {@code key=value} header lines, read in order into
 * {@link TestStats};</li>
 * <li>one line per data point, split on single spaces, where the second and
 * third tokens are parsed as the point's two coordinates;</li>
 * <li>one comma separated line per cluster; tokens containing {@code <} are
 * ignored and every other token is a 1-based point index.</li>
 * </ol>
 *
 * @author Sina Samangooei (ss@ecs.soton.ac.uk)
 *
 */
public class ClusterTestDataLoader {
    /**
     * Test details, populated from the five header lines of the input.
     *
     * @author Sina Samangooei (ss@ecs.soton.ac.uk)
     *
     */
    public static class TestStats {
        /**
         * EPS variable (value of the 1st header line)
         */
        public double eps;
        /**
         * minpts variable (value of the 2nd header line)
         */
        public int minpts;
        /**
         * nclusters variable (value of the 3rd header line)
         */
        public int ncluster;
        /**
         * noutliers variable (value of the 4th header line)
         */
        public int noutliers;
        /**
         * mineps variable (value of the 5th header line)
         */
        public double mineps;
    }

    /** Shared logger; log4j hands out one instance per name, so it need not be per-object. */
    private static final Logger logger = Logger.getLogger(ClusterTestDataLoader.class);

    /** Number of points kept per cluster, or -1 to keep the loaded data untouched. */
    private int percluster = -1;
    /** Whether the leading outlier rows are kept when the data is subsampled. */
    private boolean outliers = true;

    private TestStats testStats;
    private int[][] testClusters;
    private double[][] testData;

    /**
     * Construct a loader that keeps every point of every cluster.
     */
    public ClusterTestDataLoader() {
        this(-1, true);
    }

    /**
     * @param percluster
     *            number of points to keep from each cluster, or -1 to keep the
     *            data exactly as loaded
     * @param outliers
     *            if true, the first {@link TestStats#noutliers} data rows are
     *            kept in front of the subsampled clusters; only consulted when
     *            percluster is not -1
     *
     */
    public ClusterTestDataLoader(int percluster, boolean outliers) {
        this.percluster = percluster;
        this.outliers = outliers;
    }

    /**
     * Parse the five leading {@code key=value} lines. Only the text after the
     * first {@code =} of each line is used; the key itself is not checked.
     *
     * @param data
     *            the lines of the test file
     * @return read {@link TestStats}
     */
    private TestStats readTestStats(String[] data) {
        final TestStats ret = new TestStats();
        ret.eps = Double.parseDouble(headerValue(data[0]));
        ret.minpts = Integer.parseInt(headerValue(data[1]));
        ret.ncluster = Integer.parseInt(headerValue(data[2]));
        ret.noutliers = Integer.parseInt(headerValue(data[3]));
        ret.mineps = Double.parseDouble(headerValue(data[4]));
        return ret;
    }

    /**
     * @param line
     *            a {@code key=value} header line
     * @return the trimmed text between the first and second {@code =}
     */
    private static String headerValue(String line) {
        return line.split("=")[1].trim();
    }

    /**
     * Find the start of the next section.
     *
     * @param data
     *            the lines of the test file
     * @param from
     *            index to start scanning at
     * @return the index just after the first empty line at or after from, or
     *         {@code data.length} if there is no such line
     */
    private static int indexAfterBlankLine(String[] data, int from) {
        int i = from;
        while (i < data.length && data[i].length() != 0) {
            i++;
        }
        // Never step past the end: a missing separator means "no further section".
        return i < data.length ? i + 1 : data.length;
    }

    /**
     * Read the cluster section, i.e. everything after the second empty line.
     * Blank lines inside that section (typically trailing ones) are ignored
     * rather than handed to the integer parser.
     *
     * @param data
     *            the lines of the test file
     * @return read the correct clusters, one array of 0-based point indices
     *         per cluster line; empty if the section is missing
     */
    private int[][] readTestClusters(String[] data) {
        // Skip the header section, then the data point section.
        final int start = indexAfterBlankLine(data, indexAfterBlankLine(data, 0));
        final List<int[]> clusters = new ArrayList<int[]>();
        int count = 0;
        for (int i = start; i < data.length; i++) {
            if (data[i].trim().length() == 0) {
                continue;
            }
            final int[] cluster = readIntDataLine(data[i]);
            clusters.add(cluster);
            count += cluster.length;
        }
        logger.debug(String.format("Loading %d items in %d clusters\n", count, clusters.size()));
        return clusters.toArray(new int[clusters.size()][]);
    }

    /**
     * Parse one comma separated cluster line. Tokens containing {@code <} are
     * skipped (the cluster index); each remaining token has any {@code >}
     * removed, is trimmed, parsed as an integer and shifted from 1-based to
     * 0-based. Empty tokens are ignored.
     * <p>
     * The returned array holds exactly the parsed values, however many tokens
     * were skipped.
     *
     * @param string
     *            the cluster line
     * @return read 0-based indices, in the order they appear on the line
     */
    public int[] readIntDataLine(String string) {
        final String[] tokens = string.split(",");
        final int[] parsed = new int[tokens.length];
        int n = 0;

        for (String token : tokens) {
            if (token.contains("<")) {
                continue; // skip the first, it is the cluster index
            }
            final String cleaned = token.replace(">", "").trim();
            if (cleaned.length() == 0) {
                continue;
            }
            parsed[n++] = Integer.parseInt(cleaned) - 1;
        }

        if (n == parsed.length) {
            return parsed;
        }
        // Trim to what was actually read so no padding zeros leak out as indices.
        final int[] result = new int[n];
        System.arraycopy(parsed, 0, result, 0, n);
        return result;
    }

    /**
     * Read the data point section: the lines between the first empty line and
     * the next empty line (or the end of the input).
     *
     * @param data
     *            the lines of the test file
     * @return read the test data, one two-element row per point line
     */
    private double[][] readTestData(String[] data) {
        final List<double[]> dataL = new ArrayList<double[]>();
        for (int i = indexAfterBlankLine(data, 0); i < data.length && data[i].length() != 0; i++) {
            dataL.add(readDataLine(data[i]));
        }
        logger.debug(String.format("Loading %d data items\n", dataL.size()));
        return dataL.toArray(new double[dataL.size()][]);
    }

    /**
     * Collect every index mentioned by any cluster. Not called from within
     * this class at present.
     *
     * @param correct
     *            clusters of point indices
     * @return the distinct indices found in correct
     */
    private Set<Integer> existing(int[][] correct) {
        final Set<Integer> exist = new HashSet<Integer>();
        for (int[] cluster : correct) {
            for (int index : cluster) {
                exist.add(index);
            }
        }
        return exist;
    }

    /**
     * Parse one data point line. The line is split on single spaces; the first
     * token is ignored and the next two are the coordinates.
     *
     * @param string
     *            the data point line
     * @return the two parsed coordinates
     */
    private double[] readDataLine(String string) {
        final String[] split = string.split(" ");
        return new double[] {
                Double.parseDouble(split[1]),
                Double.parseDouble(split[2])
        };
    }

    /**
     * Parse the given lines into statistics, clusters and data points, then
     * subsample them if this loader was built with a per-cluster limit. The
     * results are available from the getters afterwards.
     *
     * @param data
     *            the lines of the test file
     */
    public void prepare(String[] data) {
        this.testStats = this.readTestStats(data);
        this.testClusters = this.readTestClusters(data);
        this.testData = this.readTestData(data);
        correctClusters();
    }

    /**
     * Reduce every cluster to its first {@code percluster} members and rebuild
     * the data array so that it holds only the kept rows, re-indexing the
     * clusters to match. Does nothing when {@code percluster} is -1.
     * <p>
     * Every cluster must hold at least {@code percluster} indices, otherwise
     * an {@link ArrayIndexOutOfBoundsException} is thrown.
     */
    private void correctClusters() {
        if (this.percluster == -1) {
            return;
        }

        final int nclusters = this.testClusters.length;
        final int[][] correctedClusters = new int[nclusters][this.percluster];

        // NOTE(review): this treats the first noutliers data rows as the outliers -
        // confirm that the generator always writes them first.
        final int kept = this.outliers ? this.testStats.noutliers : 0;
        final double[][] correctedData = new double[this.percluster * nclusters + kept][];
        System.arraycopy(this.testData, 0, correctedData, 0, kept);

        int seen = kept;
        for (int i = 0; i < nclusters; i++) {
            final int[] clust = this.testClusters[i];
            for (int j = 0; j < this.percluster; j++) {
                correctedData[seen] = this.testData[clust[j]];
                correctedClusters[i][j] = seen;
                seen++;
            }
        }

        this.testClusters = correctedClusters;
        this.testData = correctedData;
    }

    /**
     * @return the statistics read by the last call to
     *         {@link #prepare(String[])}, or null if it has not been called
     */
    public TestStats getTestStats() {
        return this.testStats;
    }

    /**
     * @return the data points produced by the last call to
     *         {@link #prepare(String[])}, or null if it has not been called
     */
    public double[][] getTestData() {
        return this.testData;
    }

    /**
     * @return the clusters produced by the last call to
     *         {@link #prepare(String[])}, as indices into
     *         {@link #getTestData()}, or null if it has not been called
     */
    public int[][] getTestClusters() {
        return this.testClusters;
    }
}