-
Notifications
You must be signed in to change notification settings - Fork 2
/
people.py
37 lines (21 loc) · 956 Bytes
/
people.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
#import sys
from pyspark import SparkContext
from pyspark.sql import SQLContext, Row
# Script body: read people.txt, turn each line into a Row(name, age),
# register the result as the temp table "people", and write it out as Parquet.
sc = SparkContext(appName="people")
try:
    sqlContext = SQLContext(sc)
    path = "/app/sys/ra/wgs"

    # Load a text file and convert each comma-separated line to a Row.
    # Field 0 is used as the name; field 1 is parsed as an integer age.
    lines = sc.textFile(path + "/people.txt")
    parts = lines.map(lambda l: l.split(","))
    people = parts.map(lambda p: Row(name=p[0], age=int(p[1])))

    # Infer the schema, and register the SchemaRDD as a table.
    schemaPeople = sqlContext.inferSchema(people)
    schemaPeople.registerTempTable("people")

    # Disabled example: SQL can be run over SchemaRDDs that have been
    # registered as a table. The results of SQL queries are RDDs and support
    # all the normal RDD operations.
    # teenagers = sqlContext.sql(
    #     "SELECT name FROM people WHERE age >= 13 AND age <= 19")
    # teenNames = teenagers.map(lambda p: "Name: " + p.name)
    # for teenName in teenNames.collect():
    #     print(teenName)

    # Called for its side effect of writing the Parquet output; the return
    # value is not used, so it is deliberately not bound to a name.
    schemaPeople.saveAsParquetFile(path + "/output")
finally:
    # Always stop the SparkContext, even if any step above raised.
    sc.stop()