pyspark

from pyspark.sql import SparkSession

# create a SparkSession for an app named linuxhint
spark = SparkSession.builder.appName('linuxhint').getOrCreate()

# create student data with 5 rows and 6 attributes
students = [{'rollno': '001', 'name': 'sravan', 'age': 23, 'height': 5.79, 'weight': 67, 'address': 'guntur'},
            {'rollno': '002', 'name': 'ojaswi', 'age': 16, 'height': 3.79, 'weight': 34, 'address': 'hyd'},
            {'rollno': '003', 'name': 'gnanesh chowdary', 'age': 7, 'height': 2.79, 'weight': 17, 'address': 'patna'},
            {'rollno': '004', 'name': 'rohith', 'age': 9, 'height': 3.69, 'weight': 28, 'address': 'hyd'},
            {'rollno': '005', 'name': 'sridevi', 'age': 37, 'height': 5.59, 'weight': 54, 'address': 'hyd'}]

# create the dataframe
df = spark.createDataFrame(students)



# two ways to build a dataframe that contains null values
data_with_null = [{'a': None, 'b': 2}, {'a': 1, 'b': None}]
df = spark.createDataFrame(data_with_null)

# or add lit(None) as a new column (cast it so Spark knows the column type)
from pyspark.sql.types import StringType
from pyspark.sql.functions import lit
df = df.withColumn("null_val", lit(None).cast(StringType()))

https://stackoverflow.com/questions/33038686/add-an-empty-column-to-spark-dataframe

To replace the null values in a column that contains them, use the fill/fillna functions or the coalesce function; see the links below and the short sketch after them.
https://sparkbyexamples.com/pyspark/pyspark-fillna-fill-replace-null-values/
https://spark.apache.org/docs/3.1.1/api/python/reference/api/pyspark.sql.functions.coalesce.html
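
A minimal sketch of both approaches, reusing the df with nullable columns 'a' and 'b' from above (the derived column name 'a_or_b' is just for illustration):

from pyspark.sql.functions import coalesce, lit

# fillna (df.na.fill is an alias) replaces nulls with constants,
# either one value for every column or per-column defaults
df_filled = df.fillna({'a': 0, 'b': -1})

# coalesce keeps the first non-null value per row, here falling back to 0
df_coalesced = df.withColumn('a_or_b', coalesce(df['a'], df['b'], lit(0)))
df_coalesced.show()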


# sample data: consecutive numbers from 1 (inclusive) to 10 (exclusive)
df = spark.range(1, 10)

https://linuxhint.com/sum-pyspark/
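
A quick sketch of summing that range, as in the link above; spark.range puts the values in a single column named 'id', so the total here is 1 + 2 + ... + 9 = 45:

from pyspark.sql.functions import sum as spark_sum

# aggregate the whole column; returns a single-row dataframe
df.select(spark_sum('id')).show()

# dictionary-style shorthand for the same aggregation
df.agg({'id': 'sum'}).show()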


scala spark

// Item case class for the sample rows (as in the awslabs/deequ README); define it before parallelize
case class Item(id: Long, productName: String, description: String, priority: String, numViews: Long)

val rdd = spark.sparkContext.parallelize(Seq(
  Item(1, "Thingy A", "awesome thing.", "high", 0),
  Item(2, "Thingy B", "available at http://thingb.com", null, 0),
  Item(3, null, null, "low", 5),
  Item(4, "Thingy D", "checkout https://thingd.ca", "low", 10),
  Item(5, "Thingy E", null, "high", 12)))

val data = spark.createDataFrame(rdd)


https://github.com/awslabs/deequ
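
The rows above mirror the sample data in the deequ README. deequ itself is Scala, but awslabs also publishes PyDeequ, a Python wrapper; the sketch below is a rough adaptation of the README's checks, assuming pydeequ is installed, the deequ jar is on the Spark classpath, and df is a PySpark DataFrame with the same columns as data above (the description string 'integrity checks on items' is arbitrary):

from pydeequ.checks import Check, CheckLevel
from pydeequ.verification import VerificationResult, VerificationSuite

# declare constraints: exactly 5 rows, complete and unique id,
# only known priority values, and no negative view counts
check = Check(spark, CheckLevel.Error, "integrity checks on items")

checkResult = (VerificationSuite(spark)
    .onData(df)
    .addCheck(check.hasSize(lambda sz: sz == 5)
                   .isComplete("id")
                   .isUnique("id")
                   .isContainedIn("priority", ["high", "low"])
                   .isNonNegative("numViews"))
    .run())

# one row per constraint, with its status and failure message
VerificationResult.checkResultsAsDataFrame(spark, checkResult).show()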

