SQL
Datatypes
-
CHAR(n)
- always has a value with a length of 2 bytes
CHAR(2)
-
VARCHAR(n)
- can have a value with a max length of 30
VARCHAR(30)
-
BIT(n)
-
DECIMAL(p, s):
p: Precision (max length)
s: Scale (places to right of decimal)
-
FLOAT(p, s)
-
DOUBLE PRECISION(p, s)
-
REAL(s)
-
INTEGER
-
BIGINT
Dates & Time
-
DATE
-
TIME
-
DATETIME
-
TIMESTAMP
Elements of a Date in SQL
-
YEAR
-
MONTH
-
DAY
-
HOUR
-
MINUTE
-
SECOND
Commands
SELECT
- To select distinct values:
| SELECT DISTINCT country FROM customercare LIMIT 10;
|
- To count the number of distinct values:
| SELECT COUNT(DISTINCT country) FROM customercare;
|
CREATE
| CREATE TABLE table_name (
column1 datatype,
column2 datatype,
column3 datatype,
....
);
|
| SELECT food_name FROM food
WHERE food_id in (1, 2, 3, 4);
|
- To create table using another table:
| CREATE TABLE new_table_name AS
SELECT column1, column2,...
FROM existing_table_name
WHERE ....;
|
| CREATE TABLE table_name (
column1 datatype constraint,
column2 datatype constraint,
column3 datatype constraint,
....
);
|
- Various constraints are:
NOT NULL
UNIQUE
PRIMARY KEY
FOREIGN KEY
CHECK
DEFAULT
CREATE INDEX
VIEW
- It is a virtual table based on the result-set of an SQL statement.
| CREATE VIEW view_name AS
SELECT column1, column2, ...
FROM table_name
WHERE condition;
|
INSERT
| INSERT INTO table_name (column1, column2, column3, ...)
VALUES (value1, value2, value3, ...);
|
OR
| INSERT INTO table_name [(column1, column2)]
SELECT column1, column2,...
FROM existing_table_name
WHERE ....;
|
OFFSET
| SELECT *
FROM callers
ORDER BY call_received DESC
OFFSET 10
LIMIT 5;
|
ALTER
ADD column
| ALTER TABLE table_name
ADD column_name datatype;
|
DROP column
| ALTER TABLE table_name
DROP COLUMN column_name;
|
RENAME column
| ALTER TABLE table_name
RENAME COLUMN old_name to new_name;
|
MODIFY datatype
| ALTER TABLE table_name
MODIFY COLUMN column_name datatype;
|
UPDATE
| UPDATE table_name
SET col_1 = val_1, col_2 = val_2
WHERE condition;
|
Warning
WHERE is important here. If it is omitted, then all the records are updated.
DELETE
| DELETE FROM table_name
WHERE condition;
|
ORDER BY
| SELECT * FROM products
ORDER BY price DESC;
|
| SELECT * FROM products
ORDER BY price DESC, amount DESC: -- if prices is same, then desc with amount
|
Info
Some mathematical ones:
Tip
Test for NULL using IS NULL or IS NOT NULL.
INDEX
| CREATE INDEX index_name
ON table_name (column1, column2, ...);
|
SAVEPOINT
| savepoint s1; -- a checkpoint called s1 is created which can be rollbacked to
|
COMMIT
ROLLBACK
| rollback to s1; -- just doing rollback will rollback to the last commit
|
LIKE
-
%: represents zero, one or multiple characters
-
_: represents one, single character
| -- starting with a
SELECT * FROM customers
WHERE customername LIKE 'a%';
|
| -- name is L _ _ _ N
SELECT * FROM customers
WHERE customername LIKE 'L___N';
|
| -- return if any one char inside [] matches
SELECT * FROM customers
WHERE customername LIKE '[bsp]%'
|
BETWEEN
- It is inclusive: begin and end values are included.
| SELECT * FROM Products
WHERE Price BETWEEN 10 AND 20;
|
HAVING
WHERE clause cannot be used with aggregate functions
| SELECT COUNT(CustomerID), Country
FROM Customers
GROUP BY Country
HAVING COUNT(CustomerID) > 5;
|
CASE
- The
CASE expression goes through conditions and returns a value when the first condition is met (like an if-then-else statement).
- So, once a condition is true, it will stop reading and return the result.
- If no conditions are true, it returns the value in the ELSE clause.
- If there is no ELSE part and no conditions are true, it returns NULL.
| CASE
WHEN condition1 THEN result1
WHEN condition2 THEN result2
WHEN conditionN THEN resultN
ELSE result
END;
|
| SELECT CustomerName, City, Country
FROM Customers
ORDER BY
(CASE
WHEN City IS NULL THEN Country
ELSE City
END);
|
JOINS
-
Referential Integrity!
-
Basic types of joins:
Tip
There are some certain character functions like:
-
LENGTH()
-
UPPER()
-
LOWER()
-
SUBSTR()
-
ROUND()
DATE & TIME
DATE
- stores data literals
- formatted as:
YYYY-MM-DD
TIME
- stores time literals
- formatted as:
HH:MI:SS.nn
TIMESTAMP
- stores data and time literals
- formatted as:
YYYY-MM-DD HH:MI:SS.nn
CURRENT_DATE
CURRENT_TIME
CURRENT_TIMESTAMP/NOW()
EXTRACT()
- extracts a specific component from a date or timestamp
| SELECT
message_id,
sent_date,
EXTRACT(YEAR FROM sent_date) AS extracted_year,
EXTRACT(MONTH FROM sent_date) AS extracted_month,
FROM messages
LIMIT 3;
|
INTERVALS
- handle date and time gaps by adding and subtracting intervals such as "3 days", "5 hours"
| SELECT
message_id,
sent_date,
sent_date + INTERVAL '2 days' AS add_2days,
sent_date - INTERVAL '3 days' AS minus_3days,
sent_date + INTERVAL '2 hours' AS add_2hours,
sent_date - INTERVAL '10 minutes' AS minus_10mins
FROM messages
LIMIT 3;
|
TO_CHAR()
- converts a date or timestamp to a string with a specified format
| SELECT
message_id,
sent_date,
TO_CHAR(sent_date, 'YYYY-MM-DD HH:MI:SS') AS formatted_iso8601,
TO_CHAR(sent_date, 'YYYY-MM-DD HH:MI:SS AM') AS formatted_12hr,
TO_CHAR(sent_date, 'Month DDth, YYYY') AS formatted_longmonth,
TO_CHAR(sent_date, 'Mon DD, YYYY') AS formatted_shortmonth,
TO_CHAR(sent_date, 'DD Month YYYY') AS formatted_daymonthyear,
TO_CHAR(sent_date, 'Month') AS formatted_dayofmonth,
TO_CHAR(sent_date, 'Day') AS formatted_dayofweek
FROM messages
LIMIT 3;
|
Tip
Casting can also be like:
::DATE
::TIMSTAMP
::FLOAT
Aggregate Functions
- return a single value
- often used by
GROUP BY clause
- The
GROUP BY clause splits the result-set into groups of values and the aggregate function can be used to return a single value for each group.
-
ignore NULL values
-
COUNT: count non-null values in a column
-
SUM: calculate sum of all values in a column
-
MAX: return the max value in a column
-
MIN: return the min value in a column
| -- smallest price for each category in the products table
SELECT MIN(Price) AS SmallestPrice, CategoryID
FROM Products
GROUP BY CategoryID;
|
AVG: calculate the average of non-null values in a column
| SELECT AVG(Price) AS AveragePrice, CategoryID
FROM Products
GROUP BY CategoryID;
|
FILTER(): filters the input data to an aggregate function
| SELECT COUNT(ID)
FILTER(WHERE ID!=2),
AVG(LENGTH(NAME))
FILTER(WHERE LENGTH(NAME)>4)
FROM TEST;
|
NULL Functions
IFNULL()
- To return something else, if value is null.
| SELECT ProductName, UnitPrice * (UnitsInStock + IFNULL(UnitsOnOrder, 0))
FROM Products;
|
COALESCE()
| SELECT ProductName, UnitPrice * (UnitsInStock + COALESCE(UnitsOnOrder, 0))
FROM Products;
|
| Another example |
|---|
| SELECT
COUNT(
CASE
WHEN device_type = 'laptop' THEN 1
END) AS laptop_views,
COUNT(
CASE
WHEN device_type in ('tablet', 'phone') THEN 1
END) AS mobile_views
FROM viewership;
|
Tip
Difference between COALESCE and IFNULL lies in the number of arguments they can take. COALESCE takes multiple arguments but IFNULL handles two arguments.
String Functions
UPPER('avenger')/LOWER()
LEFT('avenger', 4)/RIGHT(): Extract the initial n characters
LENGTH()
POSITION('man' IN 'Ironman'): Returns position
TRIM('x')/LTRIM('x')/RTRIM('x')/BTRIM('x')
CONCAT('a', 'b', 'c'): Concatenate
CONCAT_WS(seperator, string1, string2): Concatenate with seperator
SUBSTRING(string, start_pos, length): Extract a substring
SPLIT_PART(string, delimeter, part_number): Split a string into different parts
Stored Procedure
- A prepared SQL code that we can save and reuse again and again.
- We can also pass parameters to a stored procedure.
| -- create the procedure
CREATE PROCEDURE SelectAllCustomers @City nvarchar(30)
AS
SELECT * FROM Customers WHERE City = @City
GO;
-- execute it
EXEC SelectAllCustomers @City = 'London';
|
CTE vs Subquery
-
CTE
- Common Table Expression
- query within a query
- create temp. tables for the duration of the main query for more readability and maintainability
-
Subquery
- aka inner queries
- embed one query within another
- to filter data within the main query
- enable granular control over the data
| subquery example |
|---|
| SELECT artist_name
FROM concerts
WHERE concert_revenue > (
SELECT AVG(concert_revenue) FROM concerts);
|
Window Functions
| SELECT e.*,
MAX(salary) OVER (
PARTITION BY dept_name
) AS rn
FROM employee e;
|
Ranking Window Functions
ROW_NUMBER()
- assigns a row number to each window
| SELECT e.*,
ROW_NUMBER() OVER(
PARTITION BY dept_name
) AS rn
FROM employee e;
|
RANK()
- assigns a rank to each window
- same number to duplicates but skips in between
| SELECT e.*,
RANK() OVER(
PARTITION BY dept_name
) AS rn
FROM employee e;
|
DENSE_RANK()
- assigns a rank to each window
- no same number to duplicates but no skips in between
| SELECT e.*,
DENSE_RANK() OVER(
PARTITION BY dept_name
ORDER BY salary DESC
) AS rn
FROM employee e;
|
LEAD()/LAG()
LEAD() and LAG() are time-series window functions used to access data from rows that come after, or before the current row within a result set based on a specific column order.
general syntaxLEAD(column_name, offset) OVER ( -- Compulsory expression
PARTITION BY partition_column -- Optional expression
ORDER BY order_column) -- Compulsory expression
LAG(column_name, offset) OVER ( -- Compulsory expression
PARTITION BY partition_column -- Optional expression
ORDER BY order_column) -- Compulsory expression
Info
The offset value lets you decide how many rows you want to inspect, whether it's in the future or the past. By default, the offset is set to 1, meaning it considers the very next or previous row. For example, if you set the offset to 1, it means you're looking 1 row ahead (future) or 1 row behind (past). If you set it to 2, you're looking 2 rows ahead or behind, and so on.
Vertical Concatenation
UNION: removes duplicates; only unique
UNION ALL: everything; including duplicates
INTERSECT: common elements
EXCEPT: find records that are unique to one set of data when we have two sets of data
Some rules:
- The no. of columns in each SELECT statement must match
- The data types of the columns must match
- The cols selected in both SELECT statements must be in the same order
SQL Order of Execution
The order is:
- FROM
- WHERE
- GROUP BY
- HAVING
- SELECT
- ORDER BY
- LIMIT/OFFSET
Pivoting
-
involves rotating a table by converting unique values from a single column into multiple columns.
-
This rearrangement turns rows into column values, often involving aggregations on remaining columns.
Pivot
| SELECT
superhero_alias,
MAX(CASE WHEN platform = 'Instagram' THEN engagement_rate END) AS instagram_engagement_rate,
MAX(CASE WHEN platform = 'Twitter' THEN engagement_rate END) AS twitter_engagement_rate,
MAX(CASE WHEN platform = 'TikTok' THEN engagement_rate END) AS tiktok_engagement_rate,
MAX(CASE WHEN platform = 'YouTube' THEN engagement_rate END) AS youtube_engagement_rate
FROM marvel_avengers
WHERE superhero_alias IN ('Iron Man', 'Captain America', 'Black Widow', 'Thor')
GROUP BY superhero_alias
ORDER BY superhero_alias;
|
Unpivot
| SELECT
superhero_alias,
platform,
CASE platform
WHEN 'Instagram' THEN engagement_rate
WHEN 'Twitter' THEN engagement_rate
WHEN 'YouTube' THEN engagement_rate
WHEN 'TikTok' THEN engagement_rate
END AS engagement_rate
FROM marvel_avengers
WHERE superhero_alias IN ('Iron Man', 'Captain America', 'Black Widow', 'Thor')
ORDER BY superhero_alias;
|
Some Important Questions
Q. https://datalemur.com/questions/updated-status
| SELECT COALESCE(advertiser.user_id, daily_pay.user_id) AS user_id,
CASE
WHEN daily_pay.paid IS NULL THEN 'CHURN'
WHEN (daily_pay.paid IS NOT NULL) AND (advertiser.status = 'CHURN') THEN 'RESURRECT'
WHEN (daily_pay.paid IS NOT NULL) AND (advertiser.status in ('EXISTING', 'RESURRECT', 'NEW')) THEN 'EXISTING'
WHEN (daily_pay.paid IS NOT NULL) AND (advertiser.status IS NULL) THEN 'NEW'
END AS new_status
FROM advertiser
FULL OUTER JOIN daily_pay
ON advertiser.user_id = daily_pay.user_id
ORDER BY user_id;
-- did COALESCE() because we did FULL OUTER JOIN, because of that there were some
-- null values in advertiser user id part as well as daily_pay user id part.
-- so we said in COALESCE() that if first is not null then take it otherwise the
-- second one
|
Q. https://datalemur.com/questions/sql-average-post-hiatus-1
| SELECT user_id, EXTRACT(DAYS FROM MAX(post_date) - MIN(post_date)) AS days_between
FROM posts
WHERE EXTRACT(YEAR FROM post_date) = '2021'
GROUP BY user_id
HAVING COUNT(user_id) >= 2 ;
|
Q. Fetch the first 2 employees from each department to join the company.
| SELECT * FROM (
SELECT e.*
ROW_NUMBER() OVER(
PARTITION BY dept_name
) AS rn
FROM employee e) x
WHERE x.rn < 3;
|
Q. https://datalemur.com/questions/sql-highest-grossing
| WITH rcte AS (SELECT
ps.category,
ps.product,
SUM(spend) AS total_spend,
ROW_NUMBER() OVER( -- if i use rank, then it may happen that two rows have same value
PARTITION BY category ORDER BY SUM(spend) DESC
) AS rnk
FROM product_spend ps
WHERE EXTRACT(YEAR FROM transaction_date) = '2022'
GROUP BY ps.category, ps.product
)
SELECT
category,
product,
total_spend
FROM rcte
WHERE rnk <= 2;
|
Q. https://datalemur.com/questions/sql-cte-subquery-concert-revenue
| WITH cte AS (
SELECT
c.artist_name,
c.genre,
c.concert_revenue,
c.number_of_members,
c.concert_revenue / c.number_of_members AS revenue_per_member,
ROW_NUMBER() OVER(
PARTITION BY genre ORDER BY c.concert_revenue / c.number_of_members DESC
) AS rnk
FROM
concerts c
)
SELECT
artist_name,
genre,
concert_revenue,
number_of_members,
concert_revenue / number_of_members AS revenue_per_member
FROM cte
WHERE cte.rnk = 1
ORDER BY revenue_per_member DESC;
|
Q. https://datalemur.com/questions/supercloud-customer
| -- SOLUTION 1
WITH res AS (
SELECT
cc.customer_id,
COUNT(DISTINCT p.product_category)
FROM customer_contracts cc
INNER JOIN products p
ON cc.product_id = p.product_id
GROUP BY cc.customer_id
)
SELECT res.customer_id
FROM res
WHERE res.count = 3; -- 3 if there are always 3, if category change, problem.
-- SOLUTION 2 (dynamic)
SELECT
customer_id
FROM customer_contracts cc
INNER JOIN products p
ON cc.product_id = p.product_id
GROUP BY cc.customer_id
HAVING COUNT(DISTINCT product_category) = (SELECT COUNT(DISTINCT product_category) FROM products) ;
|
Q. https://datalemur.com/questions/card-launch-success
| -- with cte
WITH res AS (SELECT
mci.*,
ROW_NUMBER() OVER(
PARTITION BY card_name ORDER BY issue_year
) AS rnk
FROM monthly_cards_issued mci)
SELECT
card_name,
issued_amount
FROM res
WHERE rnk = 1
ORDER BY issued_amount DESC;
-- without cte
SELECT DISTINCT
card_name,
FIRST_VALUE(issued_amount) OVER(PARTITION BY card_name ORDER BY issue_year, issue_month)
AS issued_amount
FROM monthly_cards_issued
ORDER BY issued_amount DESC
|
Q. https://datalemur.com/questions/top-fans-rank
| WITH cte AS (SELECT
artists.artist_name,
COUNT(gsr.rank) AS count_song_rank,
DENSE_RANK() OVER(
ORDER BY COUNT(gsr.rank) DESC -- can do without partition also; only ORDER BY is compulsory
) AS artist_rank
FROM artists
INNER JOIN songs
ON artists.artist_id = songs.artist_id
INNER JOIN global_song_rank AS gsr
ON songs.song_id = gsr.song_id
WHERE gsr.rank <= 10
GROUP BY artists.artist_name)
SELECT
artist_name,
artist_rank
FROM cte
WHERE artist_rank <= 5;
|
Q. https://datalemur.com/questions/histogram-users-purchases
| WITH cte AS (SELECT
user_id,
transaction_date,
COUNT(transaction_date),
RANK() OVER(
PARTITION BY user_id ORDER BY transaction_date DESC
) as rnk
FROM user_transactions
GROUP BY user_id, transaction_date)
SELECT
transaction_date,
user_id,
count AS purchase_count
FROM cte
WHERE rnk = 1
ORDER BY transaction_date ASC;
|
Q.https://datalemur.com/questions/odd-even-measurements
| WITH cte AS (
SELECT
DATE_TRUNC('day', measurement_time) AS measurement_day, -- date trunc rounds to the beginning of the day
measurement_value,
ROW_NUMBER() OVER(
PARTITION BY EXTRACT(DAY FROM measurement_time)
ORDER BY measurement_time -- we need the rank in the ascending order of the time
) as rnk
FROM measurements)
SELECT
measurement_day,
SUM(measurement_value) FILTER (WHERE rnk % 2 != 0) AS odd_sum, -- new thing: filter
SUM(measurement_value) FILTER (WHERE rnk % 2 = 0) AS even_sum
FROM cte
GROUP BY measurement_day
ORDER BY measurement_day;
|
Q. https://datalemur.com/questions/yoy-growth-rate
| WITH cte AS (SELECT
ut.product_id,
EXTRACT(YEAR FROM ut.transaction_date) AS yr,
SUM(ut.spend) OVER(
PARTITION BY ut.product_id, EXTRACT(YEAR FROM ut.transaction_date)
) AS curr_year_spend
FROM user_transactions ut)
SELECT
yr AS year,
product_id,
curr_year_spend,
LAG(curr_year_spend) OVER(
PARTITION BY product_id ORDER BY yr
) AS prev_year_spend,
ROUND(
(
(curr_year_spend - LAG(curr_year_spend) OVER(
PARTITION BY product_id ORDER BY yr
)
) / LAG(curr_year_spend) OVER(
PARTITION BY product_id ORDER BY yr
)
) * 100, 2) AS yoy_rate
FROM cte;
|
Q. https://datalemur.com/questions/total-drugs-sales
| SELECT
manufacturer,
CONCAT('$', CAST(ROUND(SUM(total_sales)/1000000,0) AS varchar), ' million') AS sale
FROM pharmacy_sales
GROUP BY manufacturer -- group by cannot have aggregate function
ORDER BY SUM(total_sales) DESC;
|
Q. https://datalemur.com/questions/sql-histogram-tweets
| WITH cte AS
(
SELECT
user_id,
COUNT(tweet_id) AS tweet_bucket
FROM tweets
WHERE EXTRACT(YEAR FROM tweet_date) = '2022'
GROUP BY user_id)
SELECT
tweet_bucket,
COUNT(user_id) AS users_num
FROM cte
GROUP BY tweet_bucket;
|
Q. https://datalemur.com/questions/duplicate-job-listings
| WITH cte AS (
SELECT
company_id,
title,
description,
COUNT(job_id) AS job_count
FROM job_listings
GROUP BY company_id, title, description
)
SELECT
COUNT(company_id) AS duplicate_companies
FROM cte
WHERE job_count > 1;
|
Q. https://datalemur.com/questions/second-day-confirmation
| SELECT
e.user_id
FROM emails e
JOIN texts t
ON e.email_id = t.email_id
WHERE EXTRACT(DAY FROM t.action_date - e.signup_date) = 1 AND signup_action = 'Confirmed';
|
Q. https://datalemur.com/questions/sql-ibm-db2-product-analytics
| WITH cte AS (
SELECT
COALESCE(COUNT(DISTINCT q.query_id), 0) AS unique_queries, -- coalesce for making sure null is counted as 0
e.employee_id
FROM employees AS e
LEFT JOIN queries AS q
ON e.employee_id = q.employee_id
AND EXTRACT(YEAR FROM q.query_starttime) = '2023'
AND EXTRACT(MONTH FROM q.query_starttime) >= 7
AND EXTRACT(MONTH FROM q.query_starttime) <= 9
GROUP BY e.employee_id)
SELECT
unique_queries,
COUNT(employee_id) AS employee_count
FROM cte
GROUP BY unique_queries
ORDER BY unique_queries;
|
Tip
Integer division results in integer result. May need to cast to decimal type ie ::DECIMAL.
q. https://datalemur.com/questions/alibaba-compressed-mean
| SELECT
ROUND(SUM(item_count::DECIMAL * order_occurrences) / SUM(order_occurrences), 1)
AS mean
FROM items_per_order;
|
Important
Remember aggregate functions are not allowed in WHERE clause. So, if any aggregate is required, try to think of another way like this:
q. https://datalemur.com/questions/non-profitable-drugs
| SELECT
manufacturer,
COUNT(drug) AS drug_count,
ABS(SUM(total_sales - cogs)) AS total_loss
FROM pharmacy_sales
WHERE total_sales - cogs <= 0
GROUP BY manufacturer
ORDER BY ABS(SUM(total_sales - cogs)) DESC;
|
And not this:
| SELECT
manufacturer,
COUNT(drug),
SUM(total_sales) - SUM(cogs) As total_loss
FROM pharmacy_sales
WHERE SUM(total_sales) - SUM(cogs) < 0
GROUP BY manufacturer;
|
q. https://datalemur.com/questions/oracle-sales-quota
| WITH cte AS ( -- filtered before hand
SELECT
employee_id,
SUM(deal_size) AS deal_size
FROM deals
GROUP BY employee_id)
SELECT
d.employee_id,
CASE
WHEN deal_size - quota > 0 THEN 'yes'
ELSE 'no'
END AS made_quota
FROM cte d
INNER JOIN sales_quotas sq
ON d.employee_id = sq.employee_id
ORDER BY employee_id;
|