|
|
@@ -0,0 +1,120 @@
|
|
|
+package csv.operations;
|
|
|
+
|
|
|
+import java.io.File;
|
|
|
+import java.io.FileNotFoundException;
|
|
|
+import java.io.FileReader;
|
|
|
+import java.io.IOException;
|
|
|
+import java.util.List;
|
|
|
+
|
|
|
+import org.apache.commons.csv.CSVFormat;
|
|
|
+import org.apache.commons.csv.CSVParser;
|
|
|
+import org.apache.commons.csv.CSVRecord;
|
|
|
+
|
|
|
+import util.Operation;
|
|
|
+import util.Pair;
|
|
|
+import util.Prefixes;
|
|
|
+
|
|
|
+public class OutlierDetectionOperation implements Operation {
|
|
|
+
|
|
|
+ private int column;
|
|
|
+
|
|
|
+ private File csv;
|
|
|
+
|
|
|
+ private String artifactPrefix;
|
|
|
+
|
|
|
+ private String instanceName;
|
|
|
+
|
|
|
+ public OutlierDetectionOperation(int column, String prefix, String instanceName) {
|
|
|
+ this.column = column;
|
|
|
+ this.artifactPrefix = prefix;
|
|
|
+ this.instanceName = instanceName;
|
|
|
+ }
|
|
|
+
|
|
|
+ public OutlierDetectionOperation(int column, String prefix, String instanceName, File csv) {
|
|
|
+ this.column = column;
|
|
|
+ this.csv = csv;
|
|
|
+ this.artifactPrefix = prefix;
|
|
|
+ this.instanceName = instanceName;
|
|
|
+ }
|
|
|
+
|
|
|
+ public String execute() throws FileNotFoundException, IOException {
|
|
|
+ List<CSVRecord> list = readCSV();
|
|
|
+ Pair<Double,Double> sdAndAvg = calculateStandardDeviation(list);
|
|
|
+ return searchOutliers(list,sdAndAvg.getFst(), sdAndAvg.getSnd());
|
|
|
+ }
|
|
|
+
|
|
|
+ private Pair<Double,Double> calculateStandardDeviation(List<CSVRecord> list) throws IOException {
|
|
|
+ double sum = 0;
|
|
|
+ int i =1;
|
|
|
+ double sumbefore = 0;
|
|
|
+ if (list.size() > 1) {
|
|
|
+ list.remove(0);
|
|
|
+ try {
|
|
|
+ for (CSVRecord row : list) {
|
|
|
+ sumbefore = sum;
|
|
|
+ sum += Double.parseDouble(row.get(column));
|
|
|
+// Used the code below to check if overflow could be happening
|
|
|
+// if ((sumbefore < 0 && sum > 0) || (sumbefore > 0 && sum < 0)) {
|
|
|
+// System.out.println("before = " + sumbefore + " / sum = " + sum);
|
|
|
+//
|
|
|
+// }
|
|
|
+// i++;
|
|
|
+ }
|
|
|
+ } catch (ArrayIndexOutOfBoundsException e) {
|
|
|
+ throw new IOException("Your CSV file may not be well-formed. Check if all rows have the same number of columns.");
|
|
|
+ }
|
|
|
+
|
|
|
+ }
|
|
|
+ double average = sum/list.size();
|
|
|
+
|
|
|
+ // calculate the standard deviation
|
|
|
+ double sd = 0.0;
|
|
|
+ for (CSVRecord row : list) {
|
|
|
+ double value = Double.parseDouble(row.get(column));
|
|
|
+ sd += Math.pow(value - average, 2);
|
|
|
+ }
|
|
|
+
|
|
|
+ sd = Math.sqrt(sd / (list.size()));
|
|
|
+
|
|
|
+ return new Pair<Double,Double>(sd,average);
|
|
|
+ }
|
|
|
+
|
|
|
+ private String searchOutliers(List<CSVRecord> list, double sd, double average) {
|
|
|
+ StringBuilder sb = new StringBuilder();
|
|
|
+ int i = 1;
|
|
|
+ int rowN = 1;
|
|
|
+ for (CSVRecord row : list) {
|
|
|
+ double value = Double.parseDouble(row.get(column));
|
|
|
+ if (((value - average)/sd) >= 2 ) {
|
|
|
+ sb.append("<" + artifactPrefix + instanceName + "-cell-" + rowN + "" + column + ">\n");
|
|
|
+ sb.append("\trdf:type "+ Prefixes.CELL_VOCABULARY_IRI + " , ");
|
|
|
+ sb.append(Prefixes.DATA_VOCABULARY_IRI + " , owl:Thing ;\n");
|
|
|
+ sb.append("\t"+Prefixes.TABULAR_VOCABULARY_IRI+ "hasColumnPosition " + column + " ;\n");
|
|
|
+ sb.append("\t"+Prefixes.TABULAR_VOCABULARY_IRI+ "hasRowPosition " + rowN + " ;\n");
|
|
|
+ sb.append("\t"+Prefixes.TABULAR_VOCABULARY_IRI+ "holdsContent \"" + value + "\" ;\n");
|
|
|
+ sb.append("\t"+Prefixes.TABULAR_VOCABULARY_IRI+ "isCellOfTabularData <" + artifactPrefix + instanceName + "> ;\n");
|
|
|
+ sb.append("\t"+Prefixes.TABULAR_VOCABULARY_IRI+ "isInCollection <" + artifactPrefix + instanceName + "-column-" + column + "> , ");
|
|
|
+ sb.append("<" + artifactPrefix + instanceName + "-row-" + rowN + "> ;\n");
|
|
|
+ sb.append("\towl:sameAs <" + artifactPrefix + instanceName + "-cell-" + rowN + "" + column + "> .\n\n");
|
|
|
+ //sb.append("Outlier " + i++ + " = " + value + " / in row = " + rowN + "\n");
|
|
|
+ }
|
|
|
+ rowN++;
|
|
|
+ }
|
|
|
+ return sb.toString();
|
|
|
+ }
|
|
|
+
|
|
|
+ private List<CSVRecord> readCSV() throws FileNotFoundException, IOException {
|
|
|
+ CSVParser parser = new CSVParser(new FileReader(csv), CSVFormat.DEFAULT);
|
|
|
+ return parser.getRecords();
|
|
|
+ }
|
|
|
+
|
|
|
+ public void setFile(File csv) {
|
|
|
+ this.csv = csv;
|
|
|
+ }
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+}
|