通过toDF方法创建DataFrame
通过toDF的方法创建
- 集合rdd中元素类型是样例类的时候,转成DataFrame之后列名默认是属性名
- 集合rdd中元素类型是元组的时候,转成DataFrame之后列名默认就是_N
- 集合rdd中元素类型是元组/样例类的时候,转成DataFrame时可以通过 toDF("ID","NAME","SEX","AGE6") 自定义列名
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.junit.Test

/** One record of the sample data set used by the `toDF` examples. */
case class Person(id: Int, name: String, sex: String, age: Int)

/** JUnit examples that build DataFrames from local collections via `toDF`. */
class TestScala {

  // Session named "test" on the "local[4]" master, shared by the tests in this class.
  val spark = SparkSession.builder().appName("test").master("local[4]").getOrCreate()

  // Implicit conversions required for calling `toDF` on a local collection.
  import spark.implicits._

  /**
   * Creates DataFrames with `toDF` and prints each one.
   *
   * Three variants are shown:
   *  - a collection of case-class instances converted with no arguments,
   *  - a collection of tuples converted with no arguments,
   *  - a collection of tuples converted with explicit column names.
   */
  @Test
  def createDataFrameByToDF(): Unit = {
    // Variant 1: elements are case-class instances.
    val people = List(
      Person(1, "zhangsan", "man", 10),
      Person(2, "zhang2", "woman", 66),
      Person(3, "zhang3", "man", 70),
      Person(4, "zhang4", "man", 22)
    )
    val peopleDf: DataFrame = people.toDF() // relies on spark.implicits._
    peopleDf.show()

    // Variant 2: elements are tuples, no column names supplied.
    val unnamedTuples = List(
      (1, "zhangsan", "man", 10),
      (1, "zhang2", "woman", 66),
      (1, "zhang3", "man", 70),
      (1, "zhang4", "man", 22)
    )
    val unnamedDf: DataFrame = unnamedTuples.toDF() // relies on spark.implicits._
    unnamedDf.show()

    // Variant 3: elements are tuples, column names passed to toDF.
    val namedTuples = List(
      (1, "zhangsan", "man", 10),
      (1, "zhang2", "woman", 66),
      (1, "zhang3", "man", 70),
      (1, "zhang4", "man", 22)
    )
    val namedDf: DataFrame = namedTuples.toDF("ID", "NAME", "SEX", "AGE6") // relies on spark.implicits._
    namedDf.show()
  }
}
结果
通过读取文件创建DataFrame
json数据
{"age":20,"name":"qiaofeng"}
{"age":19,"name":"xuzhu"}
{"age":18,"name":"duanyu"}
/**
 * Creates a DataFrame by reading the JSON file at
 * `src/main/resources/user.json` and prints it.
 */
@Test
def createDataFrame(): Unit =
  spark.read.json("src/main/resources/user.json").show()
通过createDataFrame方法创建DF
/**
 * Creates a DataFrame with `spark.createDataFrame` from an RDD of `Row`s
 * and an explicitly declared schema, then prints it.
 *
 * The schema declares four columns: `id` (integer), `name` (string),
 * `sex` (string) and `age` (integer); each `Row` supplies its values in
 * that same order.
 */
@Test
def createDataFrameByMethod(): Unit = {
  // Imported locally so this method compiles even when the enclosing file only
  // imports DataFrame and SparkSession from org.apache.spark.sql.
  import org.apache.spark.sql.Row
  import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

  val fields = Array(
    StructField("id", IntegerType),
    StructField("name", StringType),
    StructField("sex", StringType),
    StructField("age", IntegerType)
  )
  val schema = StructType(fields)

  val rows = List(
    Row(1, "zhangsan", "man", 10),
    Row(2, "zhang2", "woman", 66),
    Row(3, "zhang3", "man", 70),
    Row(4, "zhang4", "man", 22)
  )
  val rdd = spark.sparkContext.parallelize(rows)

  val df = spark.createDataFrame(rdd, schema)
  df.show()
}