200字范文 > mysql综合练习：用一个自定义函数实现两个向量之间的余弦相似度计算

mysql综合练习：用一个自定义函数实现两个向量之间的余弦相似度计算

时间：2019-10-30 03:13:05

如有问题，请联系qq：2499496272，谢谢~~

我们一起努力！

数据

id,name,age,height,weight,yanzhi,score

1,a,18.0,172.0,120.0,98.0,68.8

2,b,28,175,120,97,68.8

3,c,30,180,130,94,88.8

4,d,18,168,110,98,68.8

5,e,26,165,120,98,68.8

6,f,27,182,135,95,89.8

7,g,19,171,122,99,68.8

代码实现

import SparkUtilimport org.apache.spark.sql.Rowimport org.apache.spark.sql.expressions.UserDefinedFunctionimport scala.collection.mutable/*** UDF 案例2 ：用一个自定义函数实现两个向量之间的余弦相似度计算*/case class Human(id: Int, name: String, features: Array[Double])object CosinSimilarity {def main(args: Array[String]): Unit = {val spark = SparkUtil.getSpark()import spark.implicits._import spark.sql// 加载用户特征数据val df = spark.read.option("inferSchema", true).option("header", true).csv("data/features.csv")df.show()/*** +---+----+----+------+------+------+-----+* | id|name| age|height|weight|yanzhi|score|* +---+----+----+------+------+------+-----+* | 1| a|18.0| 172.0| 120.0| 98.0| 68.8|* | 2| b|28.0| 175.0| 120.0| 97.0| 68.8|* | 3| c|30.0| 180.0| 130.0| 94.0| 88.8|* | 4| d|18.0| 168.0| 110.0| 98.0| 68.8|* | 5| e|26.0| 165.0| 120.0| 98.0| 68.8|* | 6| f|27.0| 182.0| 135.0| 95.0| 89.8|* | 7| g|19.0| 171.0| 122.0| 99.0| 68.8|* +---+----+----+------+------+------+-----+*/// id,name,age,height,weight,yanzhi,score// 将用户特征数据组成一个向量(数组)// 方式1：df.rdd.map(row => {val id = row.getAs[Int]("id")val name = row.getAs[String]("name")val age = row.getAs[Double]("age")val height = row.getAs[Double]("height")val weight = row.getAs[Double]("weight")val yanzhi = row.getAs[Double]("yanzhi")val score = row.getAs[Double]("score")(id, name, Array(age, height, weight, yanzhi, score))}).toDF("id", "name", "features")/*** +---+----+--------------------+* | id|name| features|* +---+----+--------------------+* | 1| a|[18.0, 172.0, 120...|* | 2| b|[28.0, 175.0, 120...|* | 3| c|[30.0, 180.0, 130...|*/// 方式2：df.rdd.map({case Row(id: Int, name: String, age: Double, height: Double, weight: Double, yanzhi: Double, score: Double)=> (id, name, Array(age, height, weight, yanzhi, score))}).toDF("id", "name", "features")// 方式3: 直接利用sql中的函数array来生成一个数组df.selectExpr("id", "name", "array(age,height,weight,yanzhi,score) as features")import org.apache.spark.sql.functions._df.select('id, 'name, array('age, 'height, 'weight, 'yanzhi, 'score) as "features")// 方式4：返回case classval features = df.rdd.map({case Row(id: Int, name: String, age: Double, height: Double, weight: Double, yanzhi: Double, score: Double)=> Human(id, name, Array(age, height, weight, yanzhi, score))}).toDF()// 将表自己和自己join，得到每个人和其他所有人的连接行val joined = features.join(features.toDF("bid","bname","bfeatures"),'id < 'bid)joined.show(100,false)/*** +---+----+--------------------------------+---+-----+--------------------------------+* |id |name|features |bid|bname|bfeatures |* +---+----+--------------------------------+---+-----+--------------------------------+* |1 |a |[18.0, 172.0, 120.0, 98.0, 68.8]|2 |b |[28.0, 175.0, 120.0, 97.0, 68.8]|* |1 |a |[18.0, 172.0, 120.0, 98.0, 68.8]|3 |c |[30.0, 180.0, 130.0, 94.0, 88.8]|* |1 |a |[18.0, 172.0, 120.0, 98.0, 68.8]|4 |d |[18.0, 168.0, 110.0, 98.0, 68.8]|* |1 |a |[18.0, 172.0, 120.0, 98.0, 68.8]|5 |e |[26.0, 165.0, 120.0, 98.0, 68.8]|* |1 |a |[18.0, 172.0, 120.0, 98.0, 68.8]|6 |f |[27.0, 182.0, 135.0, 95.0, 89.8]|* |1 |a |[18.0, 172.0, 120.0, 98.0, 68.8]|7 |g |[19.0, 171.0, 122.0, 99.0, 68.8]|* |2 |b |[28.0, 175.0, 120.0, 97.0, 68.8]|3 |c |[30.0, 180.0, 130.0, 94.0, 88.8]|* |2 |b |[28.0, 175.0, 120.0, 97.0, 68.8]|4 |d |[18.0, 168.0, 110.0, 98.0, 68.8]|* |2 |b |[28.0, 175.0, 120.0, 97.0, 68.8]|5 |e |[26.0, 165.0, 120.0, 98.0, 68.8]|* |2 |b |[28.0, 175.0, 120.0, 97.0, 68.8]|6 |f |[27.0, 182.0, 135.0, 95.0,*/// 定义一个计算余弦相似度的函数// val cosinSim = (f1:Array[Double],f2:Array[Double])=>{ /* 余弦相似度 */ }// 开根号的api： Math.pow(4.0,0.5)val cosinSim = (f1:mutable.WrappedArray[Double], f2:mutable.WrappedArray[Double])=>{val fenmu1 = Math.pow(f1.map(Math.pow(_,2)).sum,0.5)val fenmu2 = Math.pow(f2.map(Math.pow(_,2)).sum,0.5)val fenzi = f1.zip(f2).map(tp=>tp._1*tp._2).sumfenzi/(fenmu1*fenmu2)}// 注册到sql引擎： spark.udf.register("cosin_sim",consinSim)spark.udf.register("cos_sim",cosinSim)joined.createTempView("temp")// 然后在这个表上计算两人之间的余弦相似度sql("select id,bid,cos_sim(features,bfeatures) as cos_similary from temp").show()// 可以自定义函数简单包装一下，就成为一个能生成column结果的dsl风格函数了val cossim2: UserDefinedFunction = udf(cosinSim)joined.select('id,'bid,cossim2('features,'bfeatures) as "cos_sim").show()spark.close()}}