sqlflow_public/databases/sparksql
j 979e222e68 first commit 2024-10-23 17:52:59 +08:00
..
readme.md first commit 2024-10-23 17:52:59 +08:00

readme.md

SparkSQL column-level data lineage

Discover and visualization lineage from SparkSQL script. SQLFlow supports alls SQL statements of SparkSQL 3. Below are some examples that illustrate how SQLFlow works. You may try your SparkSQL query to get the lineage using the SQLFlow Cloud.

INSERT OVERWRITE DIRECTORY

INSERT OVERWRITE DIRECTORY 's3:///bucket/path/to/report'
    USING parquet
    OPTIONS (col1 1, col2 'sum')
    SELECT bar.my_flag,sum(foo.amount) as amount_sum 
	FROM mydb.foo foo 
	left join mydb.bar bar
	on foo.bar_fk = bar.pk
	group by bar.my_flag;

The lineage diagram:

sparksql insert overwrite directory

Pivot clause

CREATE TABLE person (id INT, name STRING, age INT, class INT, address STRING);

SELECT * FROM person
    PIVOT (
        SUM(age) AS a, AVG(class) AS c
        FOR name IN ('John' AS john, 'Mike' AS mike)
    );

The lineage diagram:

sparksql pivot clause

Try your SparkSQL queries using the SQLFlow Cloud Version

Programmatically using Restful APIs or SDKs to get lineage in CSV, JSON, Graphml format.

SparkSQL sapmle SQLs for reference

INSERT OVERWRITE DIRECTORY 's3:///bucket/path/to/report'
    USING parquet
    OPTIONS (col1 1, col2 'sum')
    SELECT bar.my_flag,sum(foo.amount) as amount_sum 
	FROM mydb.foo foo 
	left join mydb.bar bar
	on foo.bar_fk = bar.pk
	group by bar.my_flag;

INSERT OVERWRITE DIRECTORY
    USING parquet
    OPTIONS ('path' 's3:///bucket/path/to/report', col1 1, col2 'sum')
    SELECT bar.my_flag,sum(foo.amount) as amount_sum 
	FROM mydb.foo foo 
	left join mydb.bar bar
	on foo.bar_fk = bar.pk
	group by bar.my_flag;


create schema mydb;

create table mydb.bar(
	pk int,
	my_flag int
);
	

create table mydb.foo(
	bar_fk int,
	amount int
);

insert into mydb.bar(pk,my_flag) values(1, 100);
insert into mydb.bar(pk,my_flag) values(2, 200);
insert into mydb.bar(pk,my_flag) values(3, 300);
insert into mydb.bar(pk,my_flag) values(4, 400);
insert into mydb.foo(bar_fk,amount) values(1, 10);
insert into mydb.foo(bar_fk,amount) values(1, 20);
insert into mydb.foo(bar_fk,amount) values(2, 200);
insert into mydb.foo(bar_fk,amount) values(2, 300);
insert into mydb.foo(bar_fk,amount) values(3, 250);
insert into mydb.foo(bar_fk,amount) values(4, 350);


SELECT bar.my_flag,sum(foo.amount) as amount_sum 
FROM mydb.foo foo 
left join mydb.bar bar
on foo.bar_fk = bar.pk
group by bar.my_flag
;