Hive Practice Sets — CCA159 (CCA Data Analyst Exam) — Sample Questions

Manas Yadav
11 min read · Sep 17, 2021

[Q]: Using the retail_db orders and order_items tables, find the maximum revenue generated per year. If no matching value is present in either table, show a default maximum revenue.

[A]:
CREATE EXTERNAL TABLE orders (order_id INT, order_date STRING, order_customer_id BIGINT, order_status STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TEXTFILE LOCATION '/user/data/retail_db/orders';

CREATE EXTERNAL TABLE order_items (order_item_id INT,
order_item_order_id INT,
order_item_product_id INT,
order_item_quantity INT,
order_item_subtotal FLOAT,
order_item_product_price FLOAT) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TEXTFILE LOCATION '/user/data/retail_db/order_items';

select date_format(order_date,'yyyy'), NVL(MAX(order_item_subtotal), cast('.75' as float)) from orders o LEFT OUTER JOIN order_items oi ON (o.order_id = oi.order_item_order_id) GROUP BY date_format(order_date,'yyyy');

[Q]: Using the NYSE data, find yesterday's closing price for each stock and label today's close as High or Low against it.
[A]:
CREATE EXTERNAL TABLE nyse (stockticker STRING, transactiondate STRING, openprice FLOAT, highprice FLOAT, lowprice FLOAT, closeprice FLOAT, volume BIGINT) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TEXTFILE LOCATION '/user/data/nyse';

Sample record: WNC,19971231,28.37,28.62,28.25,28.44,70700

WITH x AS (select stockticker, unix_timestamp(transactiondate,'yyyyMMdd') dt, closeprice, LAG(closeprice,1) OVER (PARTITION BY stockticker ORDER BY unix_timestamp(transactiondate,'yyyyMMdd')) yesterdaycloseprice from nyse)
SELECT stockticker, dt, closeprice, yesterdaycloseprice, CASE WHEN closeprice > yesterdaycloseprice THEN 'High' ELSE 'Low' END from x LIMIT 10;
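One caveat: LAG() returns NULL for each ticker's first trading day, and the NULL fails the > comparison, so that row silently lands in the 'Low' bucket. A sketch that labels it explicitly instead, reusing the same WITH clause:

WITH x AS (select stockticker, unix_timestamp(transactiondate,'yyyyMMdd') dt, closeprice, LAG(closeprice,1) OVER (PARTITION BY stockticker ORDER BY unix_timestamp(transactiondate,'yyyyMMdd')) yesterdaycloseprice from nyse)
SELECT stockticker, dt, closeprice, yesterdaycloseprice,
CASE WHEN yesterdaycloseprice IS NULL THEN 'N/A' -- no prior day to compare against
WHEN closeprice > yesterdaycloseprice THEN 'High'
ELSE 'Low' END AS status
FROM x LIMIT 10;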

[Q]: Get a word count for the following input file (docs file):
Mary had a little lamb
its fleece was white as snow
and everywhere that Mary went
the lamb was sure to go.

[A]:
CREATE TABLE FILES (line STRING);
LOAD DATA LOCAL INPATH '/home/data/question/3' OVERWRITE INTO TABLE FILES;

CREATE TABLE word_counts AS
SELECT word, count(1) AS count FROM
(SELECT explode(split(line, ' ')) AS word FROM FILES) w
GROUP BY word
ORDER BY word;
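Note that splitting on a single space keeps punctuation attached ('go.' counts as its own word) and is case-sensitive. A sketch of a more forgiving tokenizer, assuming simple lowercase English text:

SELECT word, count(1) AS count
FROM (SELECT explode(split(lower(line), '[^a-z]+')) AS word FROM FILES) w
WHERE word != '' -- splitting on non-letters can emit empty strings
GROUP BY word
ORDER BY word;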

[Q]: Copy the pipe-delimited data/cards file from the local filesystem to HDFS, then load it into a Hive table partitioned by the colour of the card.
king|spades|13|Red
queen|spades|12|Green
jack|spades|11|Yellow
ten|spades|10|Black
nine|spades|9|Red
eight|spades|8|Green
seven|spades|7|Yellow
six|spades|6|Red
five|spades|5|Green
four|spades|4|Yellow
three|spades|3|Red
two|spades|2|Green
ace|spades|1|Yellow
king|clubs|13|Red
queen|clubs|12|Black

[A]:
-- first copy the file from the local filesystem to HDFS, e.g.: hdfs dfs -put cards.txt /user/data/cards.txt
CREATE TABLE cards_stg (cardnoinchar string, cardprnt string, cardno int, cardcolor string) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|';
LOAD DATA INPATH '/user/data/cards.txt' OVERWRITE INTO TABLE cards_stg;
CREATE TABLE cards (cardnoinchar string, cardprnt string, cardno int) PARTITIONED BY (cardcolor string) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|';
-- a partitioned table cannot take LOAD DATA without a partition spec, so insert from the staging table with dynamic partitioning
SET hive.exec.dynamic.partition.mode=nonstrict;
INSERT OVERWRITE TABLE cards PARTITION (cardcolor) SELECT cardnoinchar, cardprnt, cardno, cardcolor FROM cards_stg;

[Q]: Add a new column named year_val to order_items, then populate it using the order dates from the orders table.

[A]:
ALTER TABLE order_items ADD COLUMNS (year_val STRING);

INSERT OVERWRITE
TABLE order_items
SELECT oi.order_item_id
, oi.order_item_order_id
, oi.order_item_product_id
, oi.order_item_quantity
, oi.order_item_subtotal
, oi.order_item_product_price
, date_format(o.order_date, 'yyyy') AS year_val
FROM order_items oi
JOIN orders o ON (o.order_id = oi.order_item_order_id);

[Q]: Create a table with transactional control, able to support updates.
[A]:
set hive.support.concurrency=true;
set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;
create table test1 (a int, b int, c int, d string) STORED AS ORC TBLPROPERTIES ('transactional'='true');
insert into table test1 values (1,2,3,'A'),(10,20,30,'B');
update test1 set d='M' where a=1;
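On Hive versions before 3.0 (including the CDH builds typically used for this exam), ACID tables must also be bucketed; a sketch under that assumption:

create table test2 (a int, b int, c int, d string)
clustered by (a) into 2 buckets -- bucketing is mandatory for transactional tables on Hive < 3.0
STORED AS ORC TBLPROPERTIES ('transactional'='true');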

[Q]: Please accomplish the following activities:
1. Create a table region with the following structure. The underlying file format should be Parquet in HDFS with Snappy compression.
r_regionkey smallint,
r_name string,
r_comment string,
r_nations array<struct<n_nationkey:smallint,n_name:string,n_comment:string>>

2. Once the table is created, load data into it from region.csv.
Data for the region table: region.csv
r_regionkey|r_name|r_comment|r_nations <n_nationkey,n_name,n_comment>
1|AFRICA|Good Business Region for HadoopExam.com|0,Cameroon,Reference site http://www.QuickTechie.com
1|AFRICA|Good Business Region for Training4Exam.com|5,Egypt,Reference site http://www.HadoopExam.com
1|AFRICA|Good Business Region for HadoopExam.com|14,Namibia,Reference site http://www.QuickTechie.com
1|AFRICA|Good Business Region for Training4Exam.com|15,Zimbabwe,Reference site http://www.HadoopExam.com
1|AFRICA|Good Business Region for HadoopExam.com|16,Uganda,Reference site http://www.QuickTechie.com
2|AMERICA|Average Business Region for HadoopExam.com|1,United States,Reference site http://www.HadoopExam.com
2|AMERICA|Average Business Region for Training4Exam.com|2,Canada,Reference site http://www.HadoopExam.com
2|AMERICA|Average Business Region for HadoopExam.com|3,Cuba,Reference site http://www.QuickTechie.com
2|AMERICA|Average Business Region for Training4Exam.com|17,Costa Rica,Reference site http://www.HadoopExam.com
2|AMERICA|Average Business Region for HadoopExam.com|24,Panama,Reference site http://www.HadoopExam.com
3|ASIA|Best Business Region for Training4Exam.com|8,India,Reference site http://www.QuickTechie.com
3|ASIA|Best Business Region for HadoopExam.com|9,China,Reference site http://www.HadoopExam.com
3|ASIA|Best Business Region for Training4Exam.com|12,Japan,Reference site http://www.QuickTechie.com
3|ASIA|Best Business Region for HadoopExam.com|18,Russia,Reference site http://www.HadoopExam.com
3|ASIA|Best Business Region for Training4Exam.com|21,Israel,Reference site http://www.QuickTechie.com
4|EUROPE|Low sale Business Region for HadoopExam.com|6,Austria,Reference site http://www.HadoopExam.com
4|EUROPE|Low sale Business Region for Training4Exam.com|7,Bulgaria,Reference site http://www.QuickTechie.com
4|EUROPE|Low sale Business Region for HadoopExam.com|19,Belgium,Reference site http://www.HadoopExam.com
4|EUROPE|Low sale Business Region for Training4Exam.com|22,Croatia,Reference site http://www.QuickTechie.com
4|EUROPE|Low sale Business Region for HadoopExam.com|23,Denmark,Reference site http://www.HadoopExam.com
5|MIDDLE EAST|Ok Ok sale Business Region for HadoopExam.com|4,Saudi Arabia,Reference site http://www.QuickTechie.com
5|MIDDLE EAST|Ok Ok sale Business Region for Training4Exam.com|10,Yemen,Reference site http://www.HadoopExam.com
5|MIDDLE EAST|Ok Ok sale Business Region for HadoopExam.com|11,Oman,Reference site http://www.QuickTechie.com
5|MIDDLE EAST|Ok Ok sale Business Region for Training4Exam.com|13,Kuwait,Reference site http://www.HadoopExam.com
5|MIDDLE EAST|Ok Ok sale Business Region for HadoopExam.com|20,Qatar,Reference site http://www.QuickTechie.com

3. Now calculate the number of nation keys, the total of nation keys, the average of nation keys, the minimum and maximum of nation name, and the count of distinct nations.
All calculations should be region specific.

[A]:
Location: /tmp/<<name>>/hive/region.csv
-- DROP TABLE region;
CREATE TABLE stg_region
(
r_regionkey smallint,
r_name string,
r_comment string,
r_nations array<struct<n_nationkey:smallint,n_name:string,n_comment:string>>
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' MAP KEYS TERMINATED BY ','; -- struct fields inside the array sit at the map-keys delimiter level

ALTER TABLE stg_region SET TBLPROPERTIES ("skip.header.line.count"="1");
LOAD DATA LOCAL INPATH '/tmp/<<name>>/hive/region.csv' INTO TABLE stg_region;

Alternative 1:
CREATE TABLE region1
(
r_regionkey smallint,
r_name string,
r_comment string,
r_nations array<struct<n_nationkey:smallint,n_name:string,n_comment:string>>
)
STORED AS PARQUET
tblproperties('PARQUET.COMPRESSION'='SNAPPY');

INSERT INTO region1 SELECT * FROM stg_region;

Alternative 2:

CREATE TABLE region
STORED AS PARQUET
tblproperties('PARQUET.COMPRESSION'='SNAPPY') AS
SELECT * FROM stg_region;

Alternative 3: set the codec at the session level before a plain CREATE TABLE ... STORED AS PARQUET:
set parquet.compression=SNAPPY;

-- verify the file format and compression
show create table region;

describe formatted region;

REPORT:
SELECT r_regionkey, COUNT(nations_details.n_nationkey), SUM(nations_details.n_nationkey), AVG(nations_details.n_nationkey), MAX(nations_details.n_name), MIN(nations_details.n_name), COUNT(DISTINCT nations_details.n_name) FROM region LATERAL VIEW explode(r_nations) exploded_table AS nations_details GROUP BY r_regionkey;

[Q]: You have been given the following table:
HadoopExamInt (value int, property string);
1. Transform all the values from the value column into a single row, separated by '|'.
2. Transform the odd and even values (separately) from the value column into a single row each, separated by '|', and save the result in a table named "problem7".

[A]:
CREATE TABLE hadoopexamint (value int, property string);

INSERT INTO hadoopexamint
VALUES
(1,'Odd')
,(2,'Even')
,(3,'Odd')
,(4,'Even')
,(5,'Odd')
,(6,'Even')
,(7,'Odd')
,(8,'Even')
,(9,'Odd')
,(10,'Even')
,(11,'Odd')
,(12,'Even')
,(13,'Odd')
,(14,'Even')
,(15,'Odd')
,(16,'Even')
,(17,'Odd')
,(18,'Even')
,(19,'Odd')
,(20,'Even')
,(21,'Odd')
,(22,'Even');

1. select concat_ws('|', collect_list(cast(value AS string))) from hadoopexamint;
2. CREATE TABLE problem7 AS select property, concat_ws('|', collect_list(cast(value AS string))) from hadoopexamint group by property;
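collect_list does not guarantee element order, so the '|'-separated string can come out in any order. If a deterministic order matters, one option (a sketch; this relies on single-reducer behaviour, which Hive does not formally guarantee) is to sort before collecting:

select concat_ws('|', collect_list(cast(value AS string)))
from (select value from hadoopexamint order by value) t;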

[Q]: You have been given the table below.
HadoopExamInt (value int, property string);
Calculate the variance, for both a sample and the entire population, as well as the standard deviation cast to decimal(7,4).

[A]:
CREATE TABLE hadoopexamint (value int, property string);

INSERT INTO hadoopexamint
VALUES
(1,'Odd')
,(2,'Even')
,(3,'Odd')
,(4,'Even')
,(5,'Odd')
,(6,'Even')
,(7,'Odd')
,(8,'Even')
,(9,'Odd')
,(10,'Even')
,(11,'Odd')
,(12,'Even')
,(13,'Odd')
,(14,'Even')
,(15,'Odd')
,(16,'Even')
,(17,'Odd')
,(18,'Even')
,(19,'Odd')
,(20,'Even')
,(21,'Odd')
,(22,'Even');

HIVE> select variance(value) from hadoopexamint;
HIVE> select var_samp(value) from hadoopexamint;
HIVE> select var_pop(value) from hadoopexamint;
HIVE> select cast(stddev(value) as decimal(7,4)) from hadoopexamint;

[Q]: Create a table as below.
Table Name : HADOOPEXAM_STOCK
Columns : (exchange string, stock_symbol string, closing_date timestamp, closing_price decimal(8,2));
Please insert below values.
INSERT INTO hadoopexam_stock VALUES
('NSE','TCS',from_unixtime(unix_timestamp('09/08/09','dd/MM/yy')),1),
('NSE','TCS',from_unixtime(unix_timestamp('10/08/09','dd/MM/yy')),2),
('NSE','TCS',from_unixtime(unix_timestamp('11/08/09','dd/MM/yy')),3),
('NSE','TCS',from_unixtime(unix_timestamp('12/08/09','dd/MM/yy')),4),
('NSE','TCS',from_unixtime(unix_timestamp('13/08/09','dd/MM/yy')),5),
('NSE','TCS',from_unixtime(unix_timestamp('14/08/09','dd/MM/yy')),6),
('NSE','TCS',from_unixtime(unix_timestamp('15/08/09','dd/MM/yy')),7),
('NSE','TCS',from_unixtime(unix_timestamp('16/08/09','dd/MM/yy')),8),
('NSE','TCS',from_unixtime(unix_timestamp('17/08/09','dd/MM/yy')),9),
('NSE','TCS',from_unixtime(unix_timestamp('18/08/09','dd/MM/yy')),10),
('NSE','TCS',from_unixtime(unix_timestamp('19/08/09','dd/MM/yy')),11),
('NSE','TCS',from_unixtime(unix_timestamp('20/08/09','dd/MM/yy')),12),
('NSE','TCS',from_unixtime(unix_timestamp('21/08/09','dd/MM/yy')),13),
('NSE','INFY',from_unixtime(unix_timestamp('09/08/09','dd/MM/yy')),14),
('NSE','INFY',from_unixtime(unix_timestamp('10/08/09','dd/MM/yy')),15),
('NSE','INFY',from_unixtime(unix_timestamp('11/08/09','dd/MM/yy')),16),
('NSE','INFY',from_unixtime(unix_timestamp('12/08/09','dd/MM/yy')),17),
('NSE','INFY',from_unixtime(unix_timestamp('13/08/09','dd/MM/yy')),18),
('NSE','INFY',from_unixtime(unix_timestamp('14/08/09','dd/MM/yy')),19),
('NSE','INFY',from_unixtime(unix_timestamp('15/08/09','dd/MM/yy')),20),
('NSE','INFY',from_unixtime(unix_timestamp('16/08/09','dd/MM/yy')),21),
('NSE','INFY',from_unixtime(unix_timestamp('17/08/09','dd/MM/yy')),22),
('NSE','INFY',from_unixtime(unix_timestamp('18/08/09','dd/MM/yy')),23),
('NSE','INFY',from_unixtime(unix_timestamp('19/08/09','dd/MM/yy')),24),
('NSE','INFY',from_unixtime(unix_timestamp('20/08/09','dd/MM/yy')),25),
('NSE','INFY',from_unixtime(unix_timestamp('21/08/09','dd/MM/yy')),26),
('NSE','INFY',from_unixtime(unix_timestamp('21/08/09','dd/MM/yy')),27);

- Write a query to compute the moving average of the closing price with a 3-day span.
- Write a query to compute the moving average of the closing price with a 3-day span, 1 preceding and 1 following.
- Write a query to compute a cumulative moving average, from the earliest data up to the value for each day.

[A]:
create table hadoopexam_stock (exchange1 string, stock_symbol string, closing_date timestamp, closing_price decimal(8,2)); -- exchange is a reserved word in Hive, hence the column name exchange1

select exchange1, stock_symbol, closing_date, closing_price, AVG(closing_price) OVER (PARTITION BY stock_symbol ORDER BY closing_date ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) as moving_average from hadoopexam_stock;

select exchange1, stock_symbol, closing_date, AVG(closing_price) OVER (PARTITION BY stock_symbol ORDER BY closing_date ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING) as moving_avg from hadoopexam_stock;

select exchange1, stock_symbol, closing_date, closing_price, AVG(closing_price) OVER (PARTITION BY stock_symbol ORDER BY closing_date ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) as moving_average from hadoopexam_stock;

[Q]: You have been given the table below.

HADOOPEXAM_STOCK (exchange string, stock_symbol string, closing_date timestamp, closing_price decimal(8,2))
1. Write a query which generates the report below.
exchange, stock_symbol, closing_date, closing_price, yesterday_closing, diff_yesterday_price (the price difference between yesterday's price and today's price)

2. Write a query which generates the report below.
exchange, stock_symbol, closing_date, closing_price, last7days_closing, diff_last7days_price (the price difference between the price 7 days ago and today's price)

3. Write a query which prints whether each day's price is higher or lower than the previous day's. The output should look like this:
exchange, stock_symbol, closing_date, closing_price, position

[A]:
1) select exchange1, stock_symbol, closing_date, closing_price, yesterday_closing, yesterday_closing - closing_price AS diff_yesterday_price FROM (select exchange1, stock_symbol, closing_date, closing_price, LAG(closing_price) OVER (PARTITION BY stock_symbol ORDER BY closing_date) as yesterday_closing from hadoopexam_stock) tab;

2) select exchange1, stock_symbol, closing_date, closing_price, last7days_closing, last7days_closing - closing_price AS diff_last7days_price FROM (select exchange1, stock_symbol, closing_date, closing_price, LAG(closing_price,7) OVER (PARTITION BY stock_symbol ORDER BY closing_date) as last7days_closing from hadoopexam_stock) tab;

3) select tab.*, CASE WHEN diff > 0 THEN 'Higher' ELSE 'Lower' END AS position FROM (select exchange1, stock_symbol, closing_date, closing_price - LAG(closing_price,1) OVER (PARTITION BY stock_symbol ORDER BY closing_date) as diff from hadoopexam_stock) tab;

[Q]:
Create a table emp as below.
empno int,
ename string,
job string,
mgr int,
hiredate timestamp,
sal double,
comm double,
deptno int

Once the table is created, insert the data below.
7698, 'BLAKE', 'MANAGER', 7839, '01-05-1981', 2850, null, 30
7782, 'CLARK', 'MANAGER', 7839, '09-06-1981', 2450, null, 10
7566, 'JONES', 'MANAGER', 7839, '02-04-1981', 2975, null, 20
7788, 'SCOTT', 'ANALYST', 7566, '13-07-1987', 3000, null, 20
7902, 'FORD', 'ANALYST', 7566, '03-12-1981', 3000, null, 20
7369, 'SMITH', 'CLERK', 7902, '17-12-1980', 800, null, 20
7499, 'ALLEN', 'SALESMAN', 7698, '20-02-1981', 1600, 300, 30
7521, 'WARD', 'SALESMAN', 7698, '22-02-1981', 1250, 500, 30
7654, 'MARTIN', 'SALESMAN', 7698, '28-09-1981', 1250, 1400, 30
7844, 'TURNER', 'SALESMAN', 7698, '08-09-1981', 1500, 0, 30
7876, 'ADAMS', 'CLERK', 7788, '13-07-1987', 1100, null, 20
7900, 'JAMES', 'CLERK', 7698, '03-12-1981', 950, null, 30
7934, 'MILLER', 'CLERK', 7782, '23-01-1982', 1300, null, 10

Create a dept table as below:
deptno int,
dname string,
loc string

Insert the data below into the table.
10, 'ACCOUNTING', 'NEW YORK'
20, 'RESEARCH', 'DALLAS'
30, 'SALES', 'CHICAGO'
40, 'OPERATIONS', 'BOSTON'

Now accomplish the following activities.
1. Denormalize the data between the DEPT and EMP tables for the same DEPTNO.
2. Count the number of employees in each department, and print the department name and its count.

[A]:

1) Denormalize EMP and DEPT on DEPTNO (the tables are created below):
SELECT * FROM emp e JOIN dept d ON (e.deptno = d.deptno);

CREATE TABLE stg_emp
( empno int,
ename string,
job string,
mgr int,
hiredate string,
sal double,
comm double,
deptno int
);

CREATE TABLE emp
( empno int,
ename string,
job string,
mgr int,
hiredate timestamp,
sal double,
comm double,
deptno int
);

INSERT INTO stg_emp VALUES
(7698, 'BLAKE', 'MANAGER', 7839, '01-05-1981', 2850, null, 30),
(7782, 'CLARK', 'MANAGER', 7839, '09-06-1981', 2450, null, 10),
(7566, 'JONES', 'MANAGER', 7839, '02-04-1981', 2975, null, 20),
(7788, 'SCOTT', 'ANALYST', 7566, '13-07-1987', 3000, null, 20),
(7902, 'FORD', 'ANALYST', 7566, '03-12-1981', 3000, null, 20),
(7369, 'SMITH', 'CLERK', 7902, '17-12-1980', 800, null, 20),
(7499, 'ALLEN', 'SALESMAN', 7698, '20-02-1981', 1600, 300, 30),
(7521, 'WARD', 'SALESMAN', 7698, '22-02-1981', 1250, 500, 30),
(7654, 'MARTIN', 'SALESMAN', 7698, '28-09-1981', 1250, 1400, 30),
(7844, 'TURNER', 'SALESMAN', 7698, '08-09-1981', 1500, 0, 30),
(7876, 'ADAMS', 'CLERK', 7788, '13-07-1987', 1100, null, 20),
(7900, 'JAMES', 'CLERK', 7698, '03-12-1981', 950, null, 30),
(7934, 'MILLER', 'CLERK', 7782, '23-01-1982', 1300, null, 10);

INSERT INTO emp
SELECT empno, ename, job, mgr, from_unixtime(unix_timestamp(hiredate,'dd-MM-yyyy')), sal, comm, deptno FROM stg_emp; -- note the SimpleDateFormat pattern is dd-MM-yyyy, not DD-MM-YYYY

CREATE TABLE dept
(
deptno int,
dname string,
loc string
);

INSERT INTO dept VALUES
(10, 'ACCOUNTING', 'NEW YORK'),
(20, 'RESEARCH', 'DALLAS'),
(30, 'SALES', 'CHICAGO'),
(40, 'OPERATIONS', 'BOSTON');
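The join above covers activity 1; a query for activity 2 (employee count per department, printed with the department name) could look like this:

SELECT d.dname, COUNT(e.empno) AS emp_count
FROM emp e
JOIN dept d ON (e.deptno = d.deptno)
GROUP BY d.dname;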

[Q]:
Use the tables created in the previous exercise.
Create a static partitioned table, partitioned by order_id, with all fields separated by '~':

1. Add two static partitions: 8999 and 9000.
2. Load all orders with order id < 9000 into partition 8999.
3. Load all orders with order id > 9000 into partition 9000.

[A]:
CREATE TABLE orders_stg (order_id int, order_date STRING, order_customer_id BIGINT, order_status STRING);

-- stage the sample orders first
INSERT INTO orders_stg VALUES
(1,'2013-07-25 00:00:00.0',11599,'CLOSED'),
(8996,'2013-07-25 00:00:00.0',256,'PENDING_PAYMENT'),
(8997,'2013-07-25 00:00:00.0',12111,'COMPLETE'),
(8998,'2013-07-25 00:00:00.0',8827,'CLOSED'),
(9000,'2013-07-25 00:00:00.0',11318,'COMPLETE'),
(9001,'2013-07-25 00:00:00.0',7130,'COMPLETE'),
(9002,'2013-07-25 00:00:00.0',4530,'COMPLETE'),
(9003,'2013-07-25 00:00:00.0',2911,'PROCESSING'),
(9004,'2013-07-25 00:00:00.0',5657,'PENDING_PAYMENT');

CREATE TABLE orders (order_id_no_use int, order_date STRING, order_customer_id BIGINT, order_status STRING) PARTITIONED BY (order_id INT) ROW FORMAT DELIMITED FIELDS TERMINATED BY '~';

alter table orders ADD PARTITION (order_id=8999);
alter table orders ADD PARTITION (order_id=9000);

INSERT INTO TABLE orders PARTITION (order_id=8999) SELECT * FROM orders_stg WHERE order_id <=8999;
INSERT INTO TABLE orders PARTITION (order_id=9000) SELECT * FROM orders_stg WHERE order_id > 9000;

Alternatively, insert directly into each static partition:
INSERT INTO TABLE orders PARTITION (order_id=8999) VALUES (1,'2013-07-25 00:00:00.0',11599,'CLOSED');
INSERT INTO TABLE orders PARTITION (order_id=8999) VALUES (8996,'2013-07-25 00:00:00.0',256,'PENDING_PAYMENT');
INSERT INTO TABLE orders PARTITION (order_id=8999) VALUES (8997,'2013-07-25 00:00:00.0',12111,'COMPLETE');
INSERT INTO TABLE orders PARTITION (order_id=8999) VALUES (8998,'2013-07-25 00:00:00.0',8827,'CLOSED');

INSERT INTO TABLE orders PARTITION (order_id=9000) VALUES
(9000,'2013-07-25 00:00:00.0',11318,'COMPLETE'),
(9001,'2013-07-25 00:00:00.0',7130,'COMPLETE'),
(9002,'2013-07-25 00:00:00.0',4530,'COMPLETE'),
(9003,'2013-07-25 00:00:00.0',2911,'PROCESSING'),
(9004,'2013-07-25 00:00:00.0',5657,'PENDING_PAYMENT');
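To sanity-check the load, the partitions and their row counts can be inspected (a quick verification, not part of the asked solution):

SHOW PARTITIONS orders; -- should list order_id=8999 and order_id=9000
SELECT order_id, COUNT(*) FROM orders GROUP BY order_id; -- rows per partition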

[Q]: Please accomplish the following:
1. Get the number of orders by order_status for the date '2013-12-14'.
2. Get the number of completed orders for each date before '2013-12-14 00:00:00.0'.
3. Get the number of pending, review, and on-hold orders for each date in December 2013.
4. Get the number of orders by order_status for orders in review, hold, or pending status placed between '2013-12-01 00:00:00.0' and '2013-12-31 00:00:00.0'.

[A]:
CREATE TABLE orders3 (order_id int, order_date STRING, order_customer_id BIGINT, order_status STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
LOAD DATA LOCAL INPATH '/home/itv948027/data/orders/' OVERWRITE INTO TABLE orders3;

1) select order_status, COUNT(1) FROM orders3 WHERE order_date LIKE '2013-12-14%' GROUP BY order_status;

2) select order_date, count(1) from orders3 where order_date < '2013-12-14 00:00:00.0' and order_status = 'COMPLETE' GROUP BY order_date; -- a plain string comparison works because order_date is a zero-padded ISO-style timestamp string

3) select order_date, count(1) from orders3 where order_date like '2013-12%' and order_status IN ('PENDING','PAYMENT_REVIEW','ON_HOLD') GROUP BY order_date;
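A query for the fourth activity is missing above; a sketch, assuming the same status codes as in query 3:

4) select order_status, count(1) from orders3 where order_date between '2013-12-01 00:00:00.0' and '2013-12-31 00:00:00.0' and order_status IN ('PENDING','PAYMENT_REVIEW','ON_HOLD') group by order_status;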

[Q]:
You have been given the patient data below in a file called patient.csv.

First Name|Last Name|Address|Mobile|Phone|ZipCode
Amit|Jain|A-646, Cheru Nagar, Chennai|999999999|98989898|600020
Sumit|Saxena|D-100, Connaught Place, Delhi|1111111111|82828282|110001
Ajit|Chaube|M-101, Dwarka puri, Jaipur|2222222222|32323232|302016
Ramu|Mishra|P-101,Ahiyapur, Patna|4444444444|12121212|801108

Please accomplish the following activities:
1. Load this CSV file into HDFS.
2. Create a database in Hive named "PatientInfo".
3. Create a Hive table named PatientDetail in the following format:

Name <First Name, Last Name>
Address <HouseNo, LocalityName, City, Zip>
Phone <Mobile, Landline>

4. Make sure the Hive table stores its data in SequenceFile format.
5. The location of the data files created should be:
/user/hive/warehouse/patient/

[A]:
CREATE DATABASE PatientInfo;
USE PatientInfo;

CREATE TABLE stg_patientdetail
(
first_name STRING,
last_name STRING,
address STRING,
mobile BIGINT,
phone BIGINT,
zipcode BIGINT
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ‘|’ STORED AS TEXTFILE;

LOAD DATA INPATH '/tmp/hive/cca159/problem29.csv' INTO TABLE stg_patientdetail;

-- the question asks for nested Name/Address/Phone groups, so model them as structs
CREATE TABLE PatientDetail
(
name struct<first_name:string, last_name:string>,
address struct<houseno:string, localityname:string, city:string, zip:bigint>,
phone struct<mobile:bigint, landline:bigint>
)
STORED AS SEQUENCEFILE
LOCATION '/user/hive/warehouse/patient/';

INSERT INTO PatientDetail
SELECT named_struct('first_name', first_name, 'last_name', last_name),
named_struct('houseno', split(address, ',\\s*')[0], 'localityname', split(address, ',\\s*')[1], 'city', split(address, ',\\s*')[2], 'zip', zipcode),
named_struct('mobile', mobile, 'landline', phone)
FROM stg_patientdetail;
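The struct fields can then be addressed with dot notation, for example:

SELECT name.first_name, address.city, phone.mobile FROM PatientDetail;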

[Q]:
You have been given data in the format below.

FirstName,LastName,EMPID,LoggedInDate,JoiningDate,DeptId
Ajit,Singh,101,20131206,20131207,hadoopexamITDEPT
Arun,Kumar,102,20131206,20110607,hadoopexamPRODDEPT
Ajit,Singh,101,20131209,20131207,hadoopexamITDEPT
Ajit,Singh,101,201312011,20131207,hadoopexamITDEPT
Ajit,Singh,101,201312012,20131207,hadoopexamITDEPT
Ajit,Singh,101,201312013,20131207,hadoopexamITDEPT
Ajit,Singh,101,20131216,20131207,hadoopexamITDEPT
Ajit,Singh,101,20131217,20131207,hadoopexamITDEPT
Arun,Kumar,102,20131206,20110607,hadoopexamPRODDEPT
Arun,Kumar,102,20131209,20110607,hadoopexamPRODDEPT
Arun,Kumar,102,20131210,20110607,hadoopexamPRODDEPT
Arun,Kumar,102,20131211,20110607,hadoopexamPRODDEPT
Arun,Kumar,102,20131212,20110607,hadoopexamPRODDEPT
Arun,Kumar,102,20131213,20110607,hadoopexamMARKETDEPT
Arun,Kumar,102,20131214,20110607,hadoopexamMARKETDEPT

Remove duplicate records from this file, ignoring LoggedInDate.
The output may contain any LoggedInDate; it does not matter. Store the final result in a Hive table.

[A]:
CREATE TABLE stg_problem32
(
firstname STRING,
lastname STRING,
empid INT,
loggedindate STRING,
joiningdate STRING,
deptid STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ‘,’ STORED AS TEXTFILE
tblproperties ("skip.header.line.count"="1");

LOAD DATA LOCAL INPATH '/tmp/cca159/problem32.csv' INTO TABLE stg_problem32;

CREATE TABLE IF NOT EXISTS tbl_problem32
(
firstname STRING,
lastname STRING,
empid INT,
loggedindate STRING,
joiningdate STRING,
deptid STRING
);

Alternatively, insert the sample rows directly:
INSERT INTO TABLE stg_problem32 VALUES
('Ajit','Singh',101,'20131206','20131207','hadoopexamITDEPT'),
('Arun','Kumar',102,'20131206','20110607','hadoopexamPRODDEPT'),
('Ajit','Singh',101,'20131209','20131207','hadoopexamITDEPT'),
('Ajit','Singh',101,'20131211','20131207','hadoopexamITDEPT'),
('Ajit','Singh',101,'20131212','20131207','hadoopexamITDEPT'),
('Ajit','Singh',101,'20131213','20131207','hadoopexamITDEPT'),
('Ajit','Singh',101,'20131216','20131207','hadoopexamITDEPT'),
('Ajit','Singh',101,'20131217','20131207','hadoopexamITDEPT'),
('Arun','Kumar',102,'20131206','20110607','hadoopexamPRODDEPT'),
('Arun','Kumar',102,'20131209','20110607','hadoopexamPRODDEPT'),
('Arun','Kumar',102,'20131210','20110607','hadoopexamPRODDEPT'),
('Arun','Kumar',102,'20131211','20110607','hadoopexamPRODDEPT'),
('Arun','Kumar',102,'20131212','20110607','hadoopexamPRODDEPT'),
('Arun','Kumar',102,'20131213','20110607','hadoopexamMARKETDEPT'),
('Arun','Kumar',102,'20131214','20110607','hadoopexamMARKETDEPT');

INSERT INTO tbl_problem32
SELECT firstname, lastname, empid, MAX(loggedindate) loggedindate, joiningdate, deptid FROM stg_problem32 GROUP BY firstname, lastname, empid, joiningdate, deptid;

[Q]: You have been given the data below.

ID,URL,DATE,PUBID,ADVERTISERID
1,http://hadoopexam.com/path1/p.php?keyword=hadoop&country=india#Ref1,30/JUN/2016,PUBHADOOPEXAM,GOOGLEADSENSE
2,http://QuickTechie.com/path1/p.php?keyword=hive&country=us#Ref1,30/JUN/2016,PUBQUICKTECHIE,GOOGLEADSENSE
3,http://training4exam.com/path1/p.php?keyword=spark&country=india#Ref1,30/JUN/2016,PUBTRAINING4EXAM,GOOGLEADSENSE
4,http://hadoopexam.com/path1/p.php?keyword=pig&country=us#Ref1,30/JUN/2016,PUBHADOOPEXAM,GOOGLEADSENSE
5,http://QuickTechie.com/path1/p.php?keyword=datascience&country=india#Ref1,30/JUN/2016,PUBQUICKTECHIE,GOOGLEADSENSE
6,http://training4exam.com/path1/p.php?keyword=java&country=us#Ref1,30/JUN/2016,PUBTRAINING4EXAM,GOOGLEADSENSE
7,http://hadoopexam.com/path1/p.php?keyword=jee&country=india#Ref1,01/JUL/2016,PUBHADOOPEXAM,GOOGLEADSENSE
8,http://QuickTechie.com/path1/p.php?keyword=apache&country=us#Ref1,01/JUL/2016,PUBQUICKTECHIE,GOOGLEADSENSE
9,http://training4exam.com/path1/p.php?keyword=hadoopexam&country=india#Ref1,01/JUL/2016,PUBTRAINING4EXAM,GOOGLEADSENSE
10,http://hadoopexam.com/path1/p.php?keyword=hadooptraining&country=us#Ref1,01/JUL/2016,PUBHADOOPEXAM,GOOGLEADSENSE
11,http://QuickTechie.com/path1/p.php?keyword=de575&country=india#Ref1,01/JUL/2016,PUBQUICKTECHIE,GOOGLEADSENSE
12,http://training4exam.com/path1/p.php?keyword=cca175&country=us#Ref1,01/JUL/2016,PUBTRAINING4EXAM,GOOGLEADSENSE

Accomplish the following activities:

- Load this data into HDFS.
- Define a Hive managed table partitioned by advertised host and advertised country.
- The table must contain the following columns:
ID, DATE, PUBID, ADVERTISERID, KEYWORD

[A]:
1) Create a file named problem35.csv with the data above.
2) Create the staging table in Hive:

CREATE TABLE IF NOT EXISTS stg_problem35 ( id INT,
url STRING,
advdate STRING,
pubid STRING,
advertiserid STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ‘,’ STORED AS TEXTFILE;

LOAD DATA LOCAL INPATH '/tmp/cca159/problem35.csv' INTO TABLE stg_problem35;

CREATE TABLE IF NOT EXISTS tbl_problem35
( id INT,
advdate STRING,
pubid STRING,
advertiserid STRING,
keyword STRING
)
PARTITIONED BY (advhost STRING, advcountry STRING);

set hive.exec.dynamic.partition.mode=nonstrict;
INSERT INTO tbl_problem35 PARTITION(advhost, advcountry)
SELECT id, advdate, pubid, advertiserid,
SUBSTR(url, INSTR(url,'keyword=')+8, INSTR(url,'&') - INSTR(url,'keyword=') - 8) AS keyword,
SUBSTR(url, INSTR(url,'http://')+7, INSTR(url,'.com') - INSTR(url,'http://') - 7) AS advhost,
SUBSTR(url, INSTR(url,'country=')+8, INSTR(url,'#Ref1') - INSTR(url,'country=') - 8) AS advcountry
FROM stg_problem35;
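Hive's built-in parse_url() UDF is a cleaner alternative to the SUBSTR/INSTR arithmetic. Note it returns the full host (e.g. hadoopexam.com rather than hadoopexam), so the suffix is stripped here; a sketch:

INSERT INTO tbl_problem35 PARTITION(advhost, advcountry)
SELECT id, advdate, pubid, advertiserid,
parse_url(url, 'QUERY', 'keyword') AS keyword,
regexp_replace(parse_url(url, 'HOST'), '\\.com$', '') AS advhost, -- strip the .com suffix
parse_url(url, 'QUERY', 'country') AS advcountry
FROM stg_problem35;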
