Apache Spark to convert a sequence file into tabular format

Typically, when you do a first pass of flattening a hierarchical or nested file format like JSON, XML or HDF5, you get a format like <colName><ColVal1,ColVal2,ColVal3,…><ts1,ts2,ts3,…>
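
For illustration (the column names and values here are made up), a single flattened record and the tabular rows we want out of it could look like this:

signalName,signalValue
engine_rpm,"780.0,792.5,810.0"

ColumnName | ColumnValue
engine_rpm | 780.0
engine_rpm | 792.5
engine_rpm | 810.0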

For further analysis with Spark DataFrames/Datasets you need to bring these values into tabular format. Here is Apache Spark code to do that with the Java API.

import java.util.ArrayList;
import java.util.List;
import java.util.Scanner;
import java.util.logging.Logger;


import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.SaveMode;


import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.api.java.function.*;




public class Hd5toparquet {


	private static final Logger logger = Logger.getLogger(Hd5toparquet.class.getName());
	// private static List<String> columnValue = new ArrayList<String> ();
	// private static String columnName =  "";
	
	public static Dataset<Row> getDataFrame(SparkSession sqlContext, String format, String inputPath, String header) {


		Dataset<Row> dataFrame = null;
		String localheader = header.equalsIgnoreCase("true") ? "true" : "false";


		try {
			if (format.equals("csv")) {
				dataFrame = sqlContext.read().format("com.databricks.spark.csv").option("header", localheader)
						.option("inferSchema", "true").load(inputPath);
			} else if (format.equals("parquet")) {
				dataFrame = sqlContext.read().format("parquet").option("inferSchema", "true").load(inputPath);
			} else if (format.equals("tab")) {
				dataFrame = sqlContext.read().format("com.databricks.spark.csv").option("header", localheader)
						.option("inferSchema", "true").option("delimiter", "\\t").load(inputPath);
			} else {
				dataFrame = sqlContext.read().format("com.databricks.spark.csv").option("header", localheader)
						.option("inferSchema", "true").option("delimiter", format).load(inputPath);
			}
			return dataFrame;


		} catch (Exception e) {
			logger.severe(("Exception in get Dataframe util - " + e.getMessage()));
		}
		return null;
	}
	


	// Keep only the columns listed in selectedColumns; an empty list keeps every column
	public static Dataset<Row> getSelectedColumns(Dataset<Row> df, List<String> selectedColumns) {

		List<String> dropCols = new ArrayList<>();
		if (!selectedColumns.isEmpty()) {
			for (String s : df.columns()) {
				if (!selectedColumns.contains(s))
					dropCols.add(s);
			}


			// logger.severe("--------BEFORE---------");
			// df.printSchema();


			for (String col : dropCols) {
				df = df.drop(col);
			}
		}


		// logger.severe("--------AFTER---------");
		// df.printSchema();


		return df;


	}


	public static void writeDataframe(Dataset<Row> df, String outputFormat, String outputPath,
			List<String> selectedColumns, String savemode) {


		writeDataFrame(getSelectedColumns(df, selectedColumns), outputFormat, outputPath,savemode);


	}


	public static void writeDataFrame(Dataset<Row> df, String outputFormat, String outputPath, String savemode) {


		String localheader = "true";
		SaveMode sm = SaveMode.Overwrite;
		if (savemode.equalsIgnoreCase("append"))
			sm = SaveMode.Append;


		if (outputFormat.equalsIgnoreCase("csv") || outputFormat.equalsIgnoreCase("parquet")) {


			df.write().mode(sm).option("header", localheader).format(outputFormat).save(outputPath);
		} else if (outputFormat.equalsIgnoreCase("tab")) {
			df.write().mode(sm).format("com.databricks.spark.csv").option("header", localheader)
					.option("delimiter", "\t").save(outputPath);
		} else {
			df.write().mode(sm).format("com.databricks.spark.csv").option("header", localheader)
					.option("delimiter", outputFormat).save(outputPath);
		}
	}
	
	// key each row of the DataFrame by the string value of the given column
	public static JavaPairRDD<String, Row> dfToPairRDD(String keyColumn, Dataset<Row> df) {
		return df.toJavaRDD().keyBy(row -> row.getAs(keyColumn).toString());
	}
	
	// Flattens rows of the form <name, "v1,v2,v3,..."> into one (ColumnName, ColumnValue) row per value.
	// Note: this builds DataFrames through the SparkSession inside a ForeachFunction, which only
	// works because the demo runs with a local master; on a cluster the session is not available
	// on the executors.
	public static class FlattenForEachFunc implements ForeachFunction<Row> {

		private static final long serialVersionUID = 1L;

		private static SparkSession ss = null;
		public static Dataset<Row> dsunion = null;

		public FlattenForEachFunc(SparkSession spark) {
			ss = spark;
		}

		StructType schema = new StructType(new StructField[] {
				new StructField("ColumnName", DataTypes.StringType, false, Metadata.empty()),
				new StructField("ColumnValue", DataTypes.FloatType, true, Metadata.empty())
		});

		@Override
		public void call(Row r) throws Exception {
			String name = r.getString(0);
			String val = r.getString(1);

			// build the flattened rows for this input row only
			List<Row> rows = new ArrayList<Row>();
			if (val == null) {
				rows.add(RowFactory.create(name, null));
			} else {
				for (String s : val.split(","))
					rows.add(RowFactory.create(name, Float.parseFloat(s)));
			}

			Dataset<Row> ds = ss.createDataFrame(rows, schema);

			// union the flattened rows of every input row into one Dataset
			if (dsunion == null)
				dsunion = ds;
			else
				dsunion = dsunion.union(ds);
		}

	}
	
	 public static void main(String[] args) {
		 
		String fileLoc = "C:\\Users\\VISING\\Documents\\SEQUENCEFILE.csv";
		
		System.setProperty("hadoop.home.dir", "C:\\Users\\VISING\\Downloads");
		
		SparkSession spark = SparkSession.builder().appName("myapp")
				.master("local[*]") // only for demo and testing purposes, use spark-submit instead
				.getOrCreate();
		
		
		 Dataset<Row> df = getDataFrame(spark,"csv",fileLoc,"true");
		 df = df.limit(10);
		 df.show();
		 df.printSchema();
		 // collect the signal names from the file into a driver-side list (for inspection)
		 List<String> colN = df.select("signalName").as(Encoders.STRING()).collectAsList();
		
		 // flatten every <name, "v1,v2,..."> row into (ColumnName, ColumnValue) rows
		 Hd5toparquet.FlattenForEachFunc flatfunc = new Hd5toparquet.FlattenForEachFunc(spark);
		 df.foreach(flatfunc);

		 Dataset<Row> ds = FlattenForEachFunc.dsunion;
		 
		 ds.show(false);
		 ds.printSchema();
		 
		 Scanner input = new Scanner(System.in);

		 System.out.print("Enter the signal to profile: ");
		 String signame = input.nextLine();

		 // summary statistics (count, mean, stddev, min, max) for the chosen signal
		 Dataset<Row> dsfilter = ds.filter(ds.col("ColumnName").equalTo(signame)).describe("ColumnValue");
		 dsfilter.show();

		 writeDataFrame(ds, "parquet", "C:\\Users\\VISING\\Documents\\FLATFILE.parq", "overwrite");

		 input.close();
	    	
		 spark.stop();
	}
}
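
One caveat with the approach above: the ForeachFunction builds DataFrames through the SparkSession, which only works because the demo runs with a local master. As a rough sketch of a cluster-friendly alternative (assuming the same input shape, with a value column named signalValue here purely as a placeholder), the flattening can also be expressed with Spark's built-in split and explode functions:

import static org.apache.spark.sql.functions.col;
import static org.apache.spark.sql.functions.explode;
import static org.apache.spark.sql.functions.split;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class Hd5toparquetExplode {

	public static void main(String[] args) {
		SparkSession spark = SparkSession.builder().appName("flatten-demo")
				.master("local[*]") // demo only
				.getOrCreate();

		// same input as above: one row per signal, values as a comma-separated string
		Dataset<Row> df = spark.read().format("csv").option("header", "true")
				.option("inferSchema", "true")
				.load("C:\\Users\\VISING\\Documents\\SEQUENCEFILE.csv");

		// split the value string into an array, explode it to one row per value,
		// and cast each value to float ("signalValue" is a placeholder column name)
		Dataset<Row> flat = df
				.withColumn("ColumnValue", explode(split(col("signalValue"), ",")))
				.select(col("signalName").alias("ColumnName"),
						col("ColumnValue").cast("float").alias("ColumnValue"));

		flat.show(false);
		flat.printSchema();

		spark.stop();
	}
}

Because split and explode run as ordinary column expressions, this version distributes across executors and needs no driver-side union.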


Feel free to reach out to me if you have any questions.


Why AI/ML models fail in Production

In my many consulting assignments, I have rarely seen an AI/ML model in production. With renewed interest in AI/ML (artificial intelligence and machine learning), and rightly so, enterprises are embracing smart applications powered by models, but their frustration is visible when models cannot make it to production to work in tandem with applications.

Applications and models have to work together, in real time, to feed each other. That means a model has to keep learning incrementally from smaller, near-real-time datasets and provide output to applications that are already in production.

Here is a summary of the issues I see:

Dirty and stale data: Typically the data given to data scientists is months old and comes from older applications where attributes are not properly mapped or are missing. On top of that, the data comes from raw systems and is incomplete and inconsistent.

Models are generated in silos: Data engineers, DevOps and service engineering are not involved in model planning, so they do not understand the model life cycle and its production needs. Data scientists keep their model under the hood, and when it is exposed to the production system it falls flat.

Data at scale not considered: Many models are recursive and iterative. When you run or train them on a smaller dataset (< 1M rows), they look like good candidates for production. When they are exposed to production-level data, time and resource consumption grow exponentially. I think this is the largest cause of no-go decisions for production.

Data scientists not understanding the production environment: Creating a model in a silo with a couple of thousand data points is one thing; to have it in production, you need to understand integration points with other systems (applications), CI/CD, backup and HA, hardware topology and, most importantly, the data pipeline and workflow. A model should retrain and refresh at the right time, on the right dataset, and give its output to the right system.

Selection of code and technology: Data scientists are comfortable with R/MATLAB/Python, while most systems in production use C/C++, Java, Go or Microsoft technologies. The data scientist's language of choice has limitations in production.

Conclusion: The line between applications and models will blur in the coming days. Models will be developed as applications, and applications will be developed as models. Models will borrow CI/CD, scale, HA, integration, real-time interaction and so on from applications, and applications will be retrained, refreshed, self-learning and reinforcement-learning like models.

Feel free to share your thoughts.

About the author: Vivek is the creator of osDQ, the world's first open source data quality and prep tool: https://sourceforge.net/projects/dataquality/

He has also open sourced an Apache Spark based data pipeline framework: https://sourceforge.net/projects/apache-spark-osdq/

Json based Apache Spark framework

We at Arrah Technology are the creators of osDQ (https://sourceforge.net/projects/dataquality/), the world's first open source data quality product.

We are now working on a JSON-based Apache Spark framework where:

  • Data processing steps can be written in JSON
  • You do not need a Spark developer to write the code; the framework takes care of that
  • A business analyst can write the JSON
  • The JSON format has utilities for smooth transfer of complex SQL queries into JSON
  • We will later add a UI so that building the JSON is smooth and seamless

Here is a sample JSON

{
  "sparkconfig": 
    {
      "appname": "TransformRunner",
      "master":"local[3]",
      "verbose":"true",
      "noexecutor":"3",
      "nocore":"3",
      "executormemory":"3G",
      "confparam":"spark.speculation,false,spark.hadoop.mapreduce.map.speculative,false,spark.hadoop.mapreduce.reduce.speculative,false"
    },
  "datasources": [
    {
      "name": "SampleData",
      "location":"/Users/vsingh007c/Downloads/drive-download-20180718T060717Z-001/cdc/modified.csv",
      "locationType": "static",
      "format": "csv",
      "selectedColumns": []
    },
    {
      "name": "SampleDataJDBC",
      "format": "jdbc",
      "jdbcparam":"url,jdbc:mysql://localhost:3306/mysql,driver,com.mysql.jdbc.Driver,user,root,password,root,dbtable,(select * from help_category) AS T,partitionColumn,parent_category_id,lowerBound,0,upperBound,10000,numPartitions,10",
      "selectedColumns": []
    }
  ],
  "transformations": [
    {
      "name": "SampleData",
      "type": "sampling",
      "source": "SampleData",
      "priority": 1,
      "cache": false,
      "conditions": [
        {
          "condition": "random",
          "value": ["0.20"]
        }
      ]
    },
    {
      "name": "SampleData",
      "type": "transformation",
      "source": "SampleData",
      "priority": 2,
      "cache": false,
      "conditions": [
        {
          "condition":"UDF",
          "funcname":"concat",
          "value": [ "col1","col2"],
          "column": "testcolumnUDF",
          "datatype": "string"
        },
        {
          "condition":"UDF",
          "funcname":"regex_replace",
          "value": ["col6"],
          "column": "regexcolumnUDF",
          "aggrcondition": "N.*,vivek",
          "datatype": "string"
        },
        {
          "condition":"normalise",
          "funcname":"zeroscore",
          "value": ["col10"],
          "column": "normcol10",
          "datatype": "Long"
        },
        {
          "condition":"normalise",
          "funcname":"zscore",
          "value": ["col10"],
          "column": "zcol10",
          "datatype": "Long"
        }
      ]
    },
    {
      "name": "SampleData-Filter",
      "type": "filter",
      "source": "SampleData",
      "priority": 3,
      "cache": false,
      "conditions": [
        {
          "condition": "Expression",
          "value": ["constant","col5 is null"],
          "datatype": "String"
        },
        {
          "condition": "RLIKE",
          "column":"col6",
          "value": ["constant","^N"],
          "datatype": "String"
        },
        {
          "condition": "DROPDUPLICATES",
          "value": ["constant","col6","col10"],
          "datatype": "String"
        },
        {
          "condition": "selectexpr",
          "value": ["constant","col6","col10 as none", "'5'", "col10*col10","*"],
          "datatype": "String"
        }
      ]
    },
    {
      "name": "SampleData",
      "type": "enrichment",
      "source": "SampleData",
      "priority": 4,
      "cache": false,
      "conditions": [
        {
          "condition": "addcolumns",
          "aggrcondition": "testcolumn1:trim(col10):substrcol1:substr(now(),1,4):newcolumn:concat(testcolumn1,col4)"
        }
      ]
    },
    {
      "name": "SampleData",
      "type": "join",
      "source": "SampleData,SampleData",
      "priority": 5,
      "cache": false,
      "conditions": [
        {
          "joinType": "Unionall"
        },
        {
          "joinType": "Intersect"
        }
      ]
    },
    {
      "name": "StratifiedData",
      "type": "sampling",
      "source": "SampleData",
      "priority": 6,
      "cache": false,
      "conditions": [
        {
          "condition": "stratified",
          "value": ["0.002","I","0.8"],
          "column": "col2"
        }
      ]
    },
    
    {
      "name": "Groupdata",
      "type": "aggregate",
      "source": "StratifiedData",
      "priority": 7,
      "cache": false,
      "conditions": [
        {
          "condition": "groupby",
          "value": [ "col2"],
          "aggrcondition": "col2,count"
        }
      ]
    },
    {
      "name": "SampleData",
      "type": "enrichment",
      "source": "SampleData",
      "priority": 8,
      "cache": false,
      "conditions": [
        {
          "condition": "renamecolumns",
          "aggrcondition": "testcolumn1,rename1_vivek,substrcol1,rename2,newcolumn,rename3"
        },
        {
          "condition": "reordercolumns",
          "aggrcondition": "rename1_vivek,rename2,rename3"
        },
        {
          "condition": "replacecolumns",
          "aggrcondition": "rename3:5"
        }
      ]
    },
    {
      "name": "SampleData",
      "type": "join",
      "source": "SampleData,SampleData",
      "priority": 9,
      "cache": false,
      "conditions": [
        {
          "joinType": "left_outer",
          "leftcolumn": "rename1_vivek,rename2,rename3",
          "condition": "not equal",
          "rightcolumn": "rename1_vivek,rename2",
          "dropcolumn": "rename2"
        }
      ]
    },
    {
      "name": "SampleData",
      "type": "enrichment",
      "source": "SampleData",
      "priority": 10,
      "cache": false,
      "conditions": [
        {
          "condition": "renameallcolumns",
          "aggrcondition": "sample_"
        },
        {
          "condition": "addcolumns",
          "aggrcondition": "newcol:null"
        },
        {
          "condition": "addcolumns",
          "aggrcondition": "newcol2:case when(newcol == null) then 'not null' else 'null value' END"
        },
        {
          "condition": "addcolumns",
          "aggrcondition": "newcol1:if(newcol2='not null',0,5*5)"
        }
      ]
    }
  ],
  "outputs": [
    {
      "name": "BASE_TABLE",
      "location": "/Users/vsingh007c/Downloads/",
      "sources": ["UDFData-Filter-newCol"],
      "format": "parquet",
      "savemode": "append",
      "selectedColumns": []
    }
  ]
}
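
To give a feel for what the framework does with such a JSON (the runner is not yet published, so the class and method below are purely illustrative, not the framework's actual API), an "addcolumns" enrichment condition like "testcolumn1:trim(col10)" maps naturally onto Spark SQL expressions:

import static org.apache.spark.sql.functions.expr;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

public class AddColumnsSketch {

	// Illustrative sketch only: interpret an "addcolumns" aggrcondition such as
	// "testcolumn1:trim(col10):substrcol1:substr(now(),1,4)" as alternating
	// column-name / SQL-expression pairs and add each as a new column.
	public static Dataset<Row> addColumns(Dataset<Row> df, String aggrcondition) {
		String[] parts = aggrcondition.split(":");
		for (int i = 0; i + 1 < parts.length; i += 2) {
			String newColumn = parts[i];
			String sqlExpression = parts[i + 1];
			df = df.withColumn(newColumn, expr(sqlExpression));
		}
		return df;
	}
}

How the real framework parses these strings may differ; the point is that each JSON condition boils down to a handful of DataFrame calls, which is what lets a business analyst describe a pipeline without writing Spark code.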

Soon we will publish the README and open source the framework code. If you have any requirement or use case where you want to migrate data processing or migrate your old SQL to Spark, feel free to contact us at support@arrahtech.com.

Artificial Intelligence: Coming to the Rescue of ITOps

https://devops.com/artificial-intelligence-coming-to-the-rescue-of-itops/

Here is an article I have written on how AI can help DevOps:

According to the McKinsey Global Institute's 2018 report, artificial intelligence (AI) has the potential to create annual value of $3.5 billion to $5.8 billion across different industry sectors. Today, AI in finance and IT alone accounts for about $100 billion; hence, it is becoming quite the game changer in the IT world.

With the onset of cloud adoption, the world of IT DevOps has changed dramatically. The focus of ITOps is changing to an integrated, service-centric approach that maximizes business services availability. AI can help ITOps in early detection of outages, potential root cause prediction, finding systems and nodes that are susceptible to outages, average resolution time and more. This article highlights a few use cases where AI can be integrated with ITOps, simplifying day-to-day operations and making remediation more robust.

 

You can read the complete article at https://devops.com/artificial-intelligence-coming-to-the-rescue-of-itops/

Data Lake vs EDW

I am often asked, “What is the difference between a data warehouse and a data lake?” Since I have worked on both, let me try to answer. Both are built with the business in mind, so variation in approach is common. I am listing the generic differences.

Storage: An EDW is commonly stored in an RDBMS, in either a star schema or a snowflake schema. The schema design is not fluid and is often called “early binding” or “schema on write”. A data lake is primarily stored on Hadoop, in the cloud (S3, etc.) or in HBase/Hive if the data is structured; however, it depends entirely on the business what to use for storage. Typically the cost of storing 1 TB in an RDBMS is around 10-15K USD, while in a data lake it should be around 2-3K USD.

Purpose: An EDW is created for reporting, so it is optimized for concepts like cubes, OLAP, hierarchies, roll-up, drill-down, aggregate awareness, fan-out and dimensional navigation. A data lake works as storage for all departments of the enterprise. It exists primarily for analytics, so data is kept in flat files, with sensitive data encrypted. Generally the whole enterprise has access to it, and it is the responsibility of downstream processes to optimize the data (data quality, data preparation, joining, dimension reduction, etc.) for their needs.

Ingestion and retrieval: Data is ingested into an EDW by ETL jobs, mostly in batch mode. There is a staging area used for cleaning, transformation and aggregation of data. Retrieval is through reporting tools or SQL.

In a data lake, data is ingested primarily by file-transfer methods, both in real time and in batch mode. There is no concept of a staging area in a data lake, but the creator should take care that the data stays usable and the lake does not become a swamp. In my previous article, I explained how to create a data lake.

Data retrieval can be through file download, RESTful APIs or generic access via big data SQL if the data is structured. A data lake holds both structured and unstructured data.

Metadata: One of the most distinct differences between an EDW and a data lake is the extensive use of metadata in the data lake. Metadata is used in an EDW too, but primarily for tracking ETL job status, and it resides in the same namespace as the EDW, which limits its uses.

A data lake's metadata is the first landing place for enterprise users who want to use the lake. It has almost all the information: data types, the data dictionary, time of load, probable values, and any data quality checks or transformations. From the metadata the user decides what to use and what is available to them. If it is not in the metadata, the data does not exist for the user. Often the metadata is stored in a different namespace.

Business proposition: While an EDW's primary purpose is to generate time-bound reports for executives and power users, a data lake is a path to data democratization. A data lake also saves the enormous amount of time an EDW needs before it can give someone access.

If you want to discuss further, feel free to contact me. On my blog I have other articles related to data lakes which may help: https://viveksingh36.wordpress.com/

Vivek Singh – data architect, open source evangelist, chief contributor of the Open Source Data Quality project: http://sourceforge.net/projects/dataquality/

Author of the fiction book “The Reverse Journey”: http://www.amazon.com/Reverse-Journey-Vivek-Kumar-Singh/dp/9381115354/

Is ML/AI for big companies only?

Recently, a friend of mine who owns an SME (small and mid-size enterprise) asked me, “Is ML/AI for big companies only? Can we (SMEs) also benefit from the machine learning and artificial intelligence wave?”

I asked, “Why do you think you will not benefit?”

“Most of the use cases talk about ‘terabytes of data, complex algorithms and superhuman data scientists’, which are beyond an SME's reach. So I guess it is for big companies only,” he replied.

This may be the impression most SME owners carry, but it is far from the truth. In fact, it is the other way around: the impact of ML/AI will be more pronounced for SMEs, while for big companies it will average out.

Here are the reasons:

1.) Most ML/AI algorithms need 50-100 data points to create a model. So it is not terabytes; even KBs and MBs of data are good enough as model input.

2.) A segment of 50-100 members is good enough to define segment behavior. So a customer base of 500+ is a good use case for segment analysis.

3.) Weather and local events have a larger effect on local businesses. It is easier to track the impact for SMEs; for a large corporation it averages out.

4.) There are fewer data silos in an SME, so unlocking data is easier. Data is well understood in the SME world, while in big corporations it is very complex and often riddled with politics.

5.) SME owners understand their business in totality, while executives of big corporations know only one part of the business. Better business understanding and well-defined metrics lead to better models.

6.) Better personalized service is possible for SMEs, where segments are small, customers are local and loyalty has a major impact on the business.

7.) Tools and the cloud have now removed the cost barrier for SMEs. They should be using AI/ML extensively to drive their business.

Vivek Singh – data architect, open source evangelist, chief contributor of the Open Source Data Quality project: http://sourceforge.net/projects/dataquality/

Author of the fiction book “The Reverse Journey”: http://www.amazon.com/Reverse-Journey-Vivek-Kumar-Singh/dp/9381115354/

Apache Spark based classification and prediction model

We have open sourced Apache Spark based random forest and multilevel perceptron algorithms which can be used for classification and prediction. They can be downloaded from

https://sourceforge.net/projects/apache-spark-osdq/

Use cases:

a.) If you have a huge volume of data (a big data problem) with a large set of feature columns. For smaller datasets, you can run Apache Spark in local mode.

b.) If you want to run multiple multi-class models together to see which one gives better results. Right now random forest and multilevel perceptron are implemented, but the framework can accommodate other algorithms too.

c.) No coding required. Just change the config file and you are good to go, both for training and for classifying/predicting. All you need is Java 8.

d.) RESTful APIs are provided to predict/classify along with a probability. Easily integratable.

e.) If you want to see the accuracy of multiple label columns for dimensionality reduction.

Overview: This program can be used for both training and classification. You can train the model and then use a RESTful web service to query it. The program also exposes a RESTful web service (Jetty and Spark Java based) to offer classification/prediction as a service.

Install:

1. Download the package “spark-classifier_1.3-SNAPSHOT.zip”.

2. Unzip the pre-built distribution and follow the details below.

3. Understand the folder structure of the release after unzipping:

* spark-classifier_<version>

* /lib: contains all dependent jars

* /conf: contains classifier.properties; please review this file before running the program

* /model: the default model path where the model is saved (after training) and read (by the classification service); you should have write access to this folder

* /spark-classifier-<version>.jar: the main driver jar

Configuration: Currently it supports Random Forest and Multilayer Perceptron classifiers. Please set the algorithm under “conf/classifier.properties”:

# Currently supported algorithm RANDOM_FOREST or MULTILEVEL_PERCEPTRON

classifier.algorithm=MULTILEVEL_PERCEPTRON

#classifier.algorithm=RANDOM_FOREST

It takes a comma-separated list of columns for features and labels. '*' in the label list means it will use all columns for prediction. It will skip feature columns if they also appear in the label list.

classifier.featurecols=Number,Follow up

####list of labels to be predicted

#### '*' will process all the columns

classifier.labelcols=Root Cause

#classifier.labelcols=L1, L2, L3..

Train the model:

cmd > java -cp spark-classifier-<version>-SNAPSHOT.jar:lib/*:conf org.arrahtech.classifier.ClassifierTrainer

The input file name and output model location can be defined inside `conf/classifier.properties`. By default, the above command assumes that the `conf/classifier.properties` file is correctly set up.

Use the model to predict or classify:

cmd > java -cp spark-classifier-<version>-SNAPSHOT.jar:lib/*:conf org.arrahtech.service.ClassifierService

It will start a default Jetty server which accepts POST requests. After this you can POST to the RESTful API http://localhost:4567/classify/<algorithm_name>/<label_name> with a JSON file.

Here <algorithm_name> can be “random_forest” or “multilevel_perceptron”, <label_name> is the label column name (the column the model was trained for) in your training dataset, and the JSON file holds the feature columns and values that are the input for prediction or classification.

cmd > curl -XPOST http://localhost:4567/classify/random_forest/LABEL1 -d '[{

       "FeatureField1":"FeatureField1VALUE",

       "FeatureField2":" FeatureField2VALUE",

       "FeatureField3":" FeatureField3VALUE"}]'
Response JSON:

[{

       "classifiedLabel": "PredictedValue",

       "probability": "0.951814884316891"

 }]
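
If you would rather call the service from Java than from curl, a minimal client sketch using the JDK's HttpURLConnection (same endpoint and payload as the curl example above; the feature field names remain placeholders) could look like this:

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;

public class ClassifierClient {

	public static void main(String[] args) throws Exception {
		// same endpoint shape as the curl example: /classify/<algorithm_name>/<label_name>
		URL url = new URL("http://localhost:4567/classify/random_forest/LABEL1");
		HttpURLConnection conn = (HttpURLConnection) url.openConnection();
		conn.setRequestMethod("POST");
		conn.setDoOutput(true);
		conn.setRequestProperty("Content-Type", "application/json");

		// placeholder feature names and values; use the feature columns the model was trained on
		String payload = "[{\"FeatureField1\":\"FeatureField1VALUE\","
				+ "\"FeatureField2\":\"FeatureField2VALUE\","
				+ "\"FeatureField3\":\"FeatureField3VALUE\"}]";

		try (OutputStream os = conn.getOutputStream()) {
			os.write(payload.getBytes(StandardCharsets.UTF_8));
		}

		// print the JSON response containing classifiedLabel and probability
		try (BufferedReader in = new BufferedReader(
				new InputStreamReader(conn.getInputStream(), StandardCharsets.UTF_8))) {
			String line;
			while ((line = in.readLine()) != null) {
				System.out.println(line);
			}
		}
		conn.disconnect();
	}
}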

Things to Remember

1.) Presently it takes only text files with a field separator.

2.) Null is replaced by NULLVALUE, as null cannot be used in the model.

3.) multilevel_perceptron does not give the probability of the predicted value. This feature is available in the latest Apache Spark version.

4.) Currently label_name should not contain the hyphen ‘-’ character.

5.) If there is a space in the label column name, use ‘%20’ for the space.

If you face any issues, feel free to contact us or raise a bug. We are developing an open source platform for the integrated data life cycle: ingestion, DQ, profiling, analytics and prediction, all in one.

About me: Vivek Singh – data architect, open source evangelist, chief contributor of the Open Source Data Quality project: http://sourceforge.net/projects/dataquality/

Author of the fiction book “The Reverse Journey”: http://www.amazon.com/Reverse-Journey-Vivek-Kumar-Singh/dp/9381115354/