Strumieniowanie danych w Sparku

Bartosz Kowalik

O mnie

map(func)	reduceByKey(func, [numTasks])
filter(func)	aggregateByKey(zeroValue)(seqOp, combOp, [numTasks])
flatMap(func)	sortByKey([ascending], [numTasks])
mapPartitions(func)	join(otherDataset, [numTasks])
mapPartitionsWithIndex(func)	cogroup(otherDataset, [numTasks])
sample(withReplacement, fraction, seed)	cartesian(otherDataset)
union(otherDataset)	pipe(command, [envVars])
intersection(otherDataset)	coalesce(numPartitions)
distinct([numTasks])	repartition(numPartitions)
groupByKey([numTasks])	repartitionAndSortWithinPartitions(partitioner)

reduce(func)	takeSample(withReplacement, num, [seed])
collect()	takeOrdered(n, [ordering])
count()	saveAsTextFile(path)
first()	saveAsSequenceFile(path)
take(n)	saveAsObjectFile(path)