Automation of UPDATE of column-level statistics on Azure SQL Data Warehouse - azure-sqldw

I am planning to automate the refresh (i.e., UPDATE) of column-level statistics on my Azure SQL Data Warehouse databases. I plan to log the operations in a site-specific table and then dynamically generate SQL to refresh the statistics using the following approach (see the sketch after the list):
DATE columns refreshed daily,
ID/code primary/foreign-key columns refreshed bi-weekly,
Indicator/boolean columns refreshed monthly, and
QTY/AMT (fact) columns refreshed quarterly.
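For illustration, here is a rough sketch of the kind of statement generator I have in mind for the daily (DATE column) bucket; the data-type filter is only a placeholder for my site-specific bucketing rules, and the generated statements would be logged and executed by the automation job:
SELECT 'UPDATE STATISTICS ' + sch.name + '.' + tbl.name
       + ' (' + st.name + ');' AS refresh_stmt
FROM sys.stats st
JOIN sys.stats_columns stc
    ON stc.[object_id] = st.[object_id]
   AND stc.stats_id = st.stats_id
JOIN sys.columns c
    ON c.[object_id] = stc.[object_id]
   AND c.column_id = stc.column_id
JOIN sys.types ty
    ON ty.user_type_id = c.user_type_id
JOIN sys.tables tbl
    ON tbl.[object_id] = st.[object_id]
JOIN sys.schemas sch
    ON sch.[schema_id] = tbl.[schema_id]
WHERE st.user_created = 1
  AND ty.name = 'date';   -- daily bucket; the other buckets would filter on column name/type instead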
I reviewed the STATS_DATE function at https://msdn.microsoft.com/library/ms190330.aspx, but this function does not appear to provide the detail necessary for column-level statistics. For example, the output for one of my tables, which has three columns with statistics collected, shows NULL for STATS_DATE:
SELECT
s.object_id,
s.name,
s.stats_id,
s.user_created,
STATS_DATE(object_id, stats_id) AS statistics_date
FROM sys.stats s
where object_id = 107141;
Returns
object_id name stats_id user_created statistics_date
107,141 MySchema_MyTable_Col1 2 1 [NULL]
107,141 MySchema_MyTable_Col2 3 1 [NULL]
107,141 MySchema_MyTable_Col3 4 1 [NULL]
Did I overlook or misunderstand this function and should I be able to use STATS_DATE to manage my columns' statistics?
Following is a more complete demonstration:
--Create a columnar demonstration table
create table My_Schema.steve_test_table_columnar (c1_c integer, c2_c smallint, c3_c date, c4_c decimal(18,2) ) ;
--Create a heap demonstration table
create table My_Schema.steve_test_table_heap (c1_h integer, c2_h smallint, c3_h date, c4_h decimal(18,2) ) with (HEAP) ;
--CREATE STATISTICS statements:
CREATE STATISTICS My_Schema_STEVE_TEST_TABLE_COLUMNAR_C1_C ON My_Schema.STEVE_TEST_TABLE_COLUMNAR ( C1_C ) ;
CREATE STATISTICS My_Schema_STEVE_TEST_TABLE_COLUMNAR_C2_C ON My_Schema.STEVE_TEST_TABLE_COLUMNAR ( C2_C ) ;
CREATE STATISTICS My_Schema_STEVE_TEST_TABLE_COLUMNAR_C3_C ON My_Schema.STEVE_TEST_TABLE_COLUMNAR ( C3_C ) ;
CREATE STATISTICS My_Schema_STEVE_TEST_TABLE_COLUMNAR_C4_C ON My_Schema.STEVE_TEST_TABLE_COLUMNAR ( C4_C ) ;
CREATE STATISTICS My_Schema_STEVE_TEST_TABLE_HEAP_C1_H ON My_Schema.STEVE_TEST_TABLE_HEAP ( C1_H ) ;
CREATE STATISTICS My_Schema_STEVE_TEST_TABLE_HEAP_C2_H ON My_Schema.STEVE_TEST_TABLE_HEAP ( C2_H ) ;
CREATE STATISTICS My_Schema_STEVE_TEST_TABLE_HEAP_C3_H ON My_Schema.STEVE_TEST_TABLE_HEAP ( C3_H ) ;
CREATE STATISTICS My_Schema_STEVE_TEST_TABLE_HEAP_C4_H ON My_Schema.STEVE_TEST_TABLE_HEAP ( C4_H ) ;
--UPDATE (aka "REFRESH") STATISTICS statements:
UPDATE STATISTICS My_Schema.STEVE_TEST_TABLE_COLUMNAR ( My_Schema_STEVE_TEST_TABLE_COLUMNAR_C3_C ) ;
UPDATE STATISTICS My_Schema.STEVE_TEST_TABLE_HEAP ( My_Schema_STEVE_TEST_TABLE_HEAP_C3_H ) ;
UPDATE STATISTICS My_Schema.STEVE_TEST_TABLE_COLUMNAR ( My_Schema_STEVE_TEST_TABLE_COLUMNAR_C2_C ) ;
UPDATE STATISTICS My_Schema.STEVE_TEST_TABLE_HEAP ( My_Schema_STEVE_TEST_TABLE_HEAP_C2_H ) ;
UPDATE STATISTICS My_Schema.STEVE_TEST_TABLE_COLUMNAR ( My_Schema_STEVE_TEST_TABLE_COLUMNAR_C1_C ) ;
UPDATE STATISTICS My_Schema.STEVE_TEST_TABLE_HEAP ( My_Schema_STEVE_TEST_TABLE_HEAP_C1_H ) ;
UPDATE STATISTICS My_Schema.STEVE_TEST_TABLE_COLUMNAR ( My_Schema_STEVE_TEST_TABLE_COLUMNAR_C4_C ) ;
UPDATE STATISTICS My_Schema.STEVE_TEST_TABLE_HEAP ( My_Schema_STEVE_TEST_TABLE_HEAP_C4_H ) ;
--Validation Steps
select s.[schema_id] , s.[name] from sys.[schemas] s where s.[name] = 'My_Schema';
--Results:
schema_id name
24 My_Schema
--Get Table Object ID
select t.[object_id] , t.[name] from sys.[tables] t
inner join sys.[schemas] s
on t.[schema_id] = s.[schema_id]
where s.[name] = 'My_Schema' and t.[name] in ('STEVE_TEST_TABLE_HEAP' , 'STEVE_TEST_TABLE_COLUMNAR');
--Results:
object_id name
516,196,889 steve_test_table_columnar
532,196,946 steve_test_table_heap
--Get Column IDs
select t.[object_id], c.[column_id], t.[name], c.[name] as Column_Name
from sys.[tables] t
inner join sys.[schemas] s
    on t.[schema_id] = s.[schema_id]
inner join sys.[columns] c
    on t.[object_id] = c.[object_id]
where s.[name] = 'My_Schema'
  and t.[name] in ('STEVE_TEST_TABLE_HEAP', 'STEVE_TEST_TABLE_COLUMNAR');
--Results:
object_id column_id name Column_Name
516,196,889 1 steve_test_table_columnar c1_c
516,196,889 2 steve_test_table_columnar c2_c
516,196,889 3 steve_test_table_columnar c3_c
516,196,889 4 steve_test_table_columnar c4_c
532,196,946 1 steve_test_table_heap c1_h
532,196,946 2 steve_test_table_heap c2_h
532,196,946 3 steve_test_table_heap c3_h
532,196,946 4 steve_test_table_heap c4_h
--Final review of statistics metadata
select t.[object_id], c.[column_id], t.[name] as table_name,
       c.[name] as Column_Name, st.stats_id, st.name as Stats_Name,
       stc.stats_column_id,
       STATS_DATE(st.[object_id], st.stats_id) AS statistics_date
from sys.[tables] t
inner join sys.[schemas] s
    on t.[schema_id] = s.[schema_id]
inner join sys.[columns] c
    on t.[object_id] = c.[object_id]
inner join sys.stats st
    on st.[object_id] = t.[object_id]
   and st.user_created = 1
inner join sys.[stats_columns] stc
    on st.stats_id = stc.stats_id
   and st.[object_id] = stc.[object_id]
   and c.[column_id] = stc.[column_id]
where s.[name] = 'My_Schema'
  and t.[name] in ('STEVE_TEST_TABLE_HEAP', 'STEVE_TEST_TABLE_COLUMNAR');
object_id column_id table_name Column_Name stats_id Stats_Name stats_column_id statistics_date
516,196,889 1 steve_test_table_columnar c1_c 2 My_Schema_STEVE_TEST_TABLE_COLUMNAR_C1_C 1 [NULL]
516,196,889 2 steve_test_table_columnar c2_c 3 My_Schema_STEVE_TEST_TABLE_COLUMNAR_C2_C 1 [NULL]
516,196,889 3 steve_test_table_columnar c3_c 4 My_Schema_STEVE_TEST_TABLE_COLUMNAR_C3_C 1 [NULL]
516,196,889 4 steve_test_table_columnar c4_c 5 My_Schema_STEVE_TEST_TABLE_COLUMNAR_C4_C 1 [NULL]
532,196,946 1 steve_test_table_heap c1_h 2 My_Schema_STEVE_TEST_TABLE_HEAP_C1_H 1 [NULL]
532,196,946 2 steve_test_table_heap c2_h 3 My_Schema_STEVE_TEST_TABLE_HEAP_C2_H 1 [NULL]
532,196,946 3 steve_test_table_heap c3_h 4 My_Schema_STEVE_TEST_TABLE_HEAP_C3_H 1 [NULL]
532,196,946 4 steve_test_table_heap c4_h 5 My_Schema_STEVE_TEST_TABLE_HEAP_C4_H 1 [NULL]

I confirmed that once the tables are loaded, STATS_DATE(object_id, stats_id) returns a non-NULL value. My experiment had only created the tables without loading any data.
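A minimal sketch of that check, reusing the demo objects above (the inserted values are arbitrary): after a row exists and the statistics object has been refreshed, STATS_DATE reports the refresh time instead of NULL.
INSERT INTO My_Schema.steve_test_table_columnar VALUES (1, 2, '2021-01-01', 10.00);
UPDATE STATISTICS My_Schema.STEVE_TEST_TABLE_COLUMNAR ( My_Schema_STEVE_TEST_TABLE_COLUMNAR_C1_C );

SELECT s.name,
       STATS_DATE(s.[object_id], s.stats_id) AS statistics_date   -- now non-NULL
FROM sys.stats s
JOIN sys.tables t
    ON t.[object_id] = s.[object_id]
WHERE t.name = 'steve_test_table_columnar'
  AND s.user_created = 1;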

Related

RANKX in measures

I have a table with 2 columns: client and product_name.
I need to number (rank) the product_name values for each client:
client  product_name  rank
1       aaa           1
1       baa           2
1       cwe           3
2       te            1
3       aaa           1
3       cwq           2
I created a column
RANKX_column =
RANKX(
FILTER(Query1,Query1[client_id] = EARLIER(Query1[client_id])),
Query1[product_id],,ASC,Dense
)
but if I apply a filter, the rank is not recalculated.
I tried to rewrite this formula as a measure, but it returns an error about the EARLIER function.
Try something like:
=
VAR ThisClientID =
MIN( Query1[client_id] )
RETURN
RANKX(
FILTER( ALL( Query1 ), Query1[client_id] = ThisClientID ),
CALCULATE( MIN( Query1[product_id] ) ),
,
ASC,
DENSE
)
It is better to do it using Power Query:
First, group the rows by the [Client] column, keeping all the other columns as a nested table (the AllData column).
Then create a custom column; here is the M code:
Table.AddIndexColumn([AllData],"Rank",1,1)
Then combine (union) all the nested tables using this M code:
= Table.Combine(#"Added Custom"[Rank])
Then Close & Apply. Back in the data view, the table now includes the rank.

DAX column count latest record for each set of group with a condition

I want to get the latest updated record, which is a bit tricky to retrieve with a DAX calculated column in Power BI.
Count -> the order of the record based on Modified On (datetime), ascending
Deleted -> a flag set to True for a deleted record
Id  Name       Modified On          Deleted  Count  Result
1   Charles    09-11-2022 15:09:40           1
1              09-11-2022 15:46:33  True     2
1   Charles M  09-11-2022 20:39:40           3      True
2   Charles    09-11-2022 15:09:40           1
2              09-11-2022 15:46:33  True     2
2   Charles M  09-11-2022 20:39:40           3
2              09-11-2022 21:16:33  True     4
2   charles m  09-11-2022 21:18:33           5
3   Dani       09-11-2022 15:46:33           1      True
3              09-11-2022 21:16:33  True     2
4   George     09-11-2022 15:46:33           1
4   George K   09-11-2022 21:16:33           2
In the above example I want the Result column values exactly as shown above.
Explanation:
Id 1: the record was created twice and also deleted, so its history has multiple rows. I want the last updated record, which is the 3rd row and not the last record, because when the Deleted flag is set to True there is no Name on it.
Id 2: the same idea applies, except that the last insert for this record is not deleted, so the Result column for this Id should not return anything.
Id 3: there is no update on the record in this history table; the first row is the creation and the second is the deletion, so we have to retrieve the first record, which is the only one with data in the Name field.
Id 4: no deletion happened, so we don't want to flag any record and the Result column should be empty.
Thanks in advance
I have tried to get the latest record with
LatestDeletedRecord =
VAR latest = CALCULATE(MAX('Table'[Column3]), ALLEXCEPT('Table','Table'[Id]))
RETURN IF('Table'[Column3] = latest && 'Table'[IsDeleted] = True,True)
but other than that I couldn't get any further; I am new to DAX calculations.
Edit: If your requirements change, you should perhaps post a new question instead of editing your existing question :-)
With your altered requirements you can use this calculated column:
Result =
VAR _max =
CALCULATE (
MAX ( 'Table'[Modified On] ) ,
ALLEXCEPT ( 'Table' , 'Table'[Id] )
)
VAR _max_is_deleted =
CALCULATE (
SELECTEDVALUE ( 'Table'[Deleted] ) ,
ALLEXCEPT ( 'Table' , 'Table'[Id] ) ,
'Table'[Modified On] = _max
)
VAR _max_mod =
// Calculate the maximum modified date where name is not deleted
CALCULATE (
MAX ( 'Table'[Modified On] ) ,
ALLEXCEPT ( 'Table' , 'Table'[Id] ) ,
'Table'[Name] <> ""
)
RETURN
IF (
// For rows where ID has an associated deletion AND modified is max with name
_max_is_deleted
&& [Modified On] = _max_mod,
// Return "True"
"True"
)
This gives your desired result.

DAX - Retrieve a value from another unrelated table

I have two tables: 'Events' and 'Occurrences'.
In 'Events' I have the name of the event, a start date, and an end date.
In 'Occurrences' I have the occurrence creation date, the occurrence ID, and a description of the occurrence.
Events Table
Event    Start Date  End Date
Event 1  01/01/2022  02/14/2022
Event 2  02/15/2022  03/10/2022
Event 3  02/11/2022  03/30/2022
Occurrence Table
ID Ocurrence  Occurrence Create Date  Description
1             01/10/2022              Foo 1
2             02/11/2022              Foo 2
3             02/20/2022              Foo 3
4             03/20/2022              Foo 4
5             03/30/2022              Foo 5
My question is: how can I determine which event each occurrence belongs to?
In this example, the expected result is:
ID Ocurrence  Occurrence Create Date  Description  Event Related
1             01/10/2022              Foo 1        Event 1
2             02/11/2022              Foo 2        Event 1
3             02/20/2022              Foo 3        Event 2
4             03/20/2022              Foo 4        Event 3
5             03/30/2022              Foo 5        Event 3
Add this measure to your table:
Event Related =
VAR _occ =
SELECTEDVALUE ( 'Table (2)'[Occurrence Create Date] )
RETURN
CALCULATE (
FIRSTNONBLANK ( 'Table'[Event], 1 ),
FILTER (
ALL ( 'Table' ),
'Table'[Start Date] <= _occ
&& 'Table'[End Date] >= _occ
)
)
Or, if you want to add it only as a column in the table rather than in the visual, you can use this calculated column (more or less the same):
Event Related 2 =
var _occ = 'Table (2)'[Occurrence Create Date]
return
CALCULATE (
FIRSTNONBLANK ( 'Table'[Event], 1 ),
FILTER (
ALL ( 'Table' ),
'Table'[Start Date] <= _occ
&& 'Table'[End Date] >= _occ
)
)

Count of active employees between two date columns, by department

I'm trying to imitate this report (page 3), which slices active headcount and all the other metrics (1) by date and (2) by department.
My data looks like this (with relationships, of course):
ID  Name  DEPID  Hired Date  Terminated Date  Terminated (Y/N)
1   John  2      1/1/2019    2020/12/31       Y
2   Jane  2      1/3/2018    2019/07/26       Y
3   Jack  1      1/5/2022    null             N
Using the following measure, I was able to extract total number of employees by date, but I wasn't able to filter by department:
CountOfActive =
VAR _selectedDate = MAX ( 'Calendar'[Date] )
RETURN
    CALCULATE (
        COUNTROWS ( 'Table' );
        FILTER (
            ALL ( 'Table' );
            Table[HIREDDATE] <= VALUE ( _selectedDate )
                && ( Table[TERMINATEDDATE] >= VALUE ( _selectedDate )
                    || ISBLANK ( Table[TERMINATEDDATE] ) )
        )
    )
My ideal output is something like the following (where I'll create a table for each department and list the number of active employees, then join them to my department key table afterwards so I can slice them):
Date      Count of Active Employees  Department
2019/1/1  3                          Retail
2019/1/2  3                          Retail
2019/1/3  4                          Retail
...       ...                        ...
The "Date" column would be a calendar table built with CALENDAR().
What should I do to achieve the last table based on the data I have?
My relationship schema looks like this.
Try this: 'Table 2' is your Calendar table, which is also used as a slicer on the visual.
Make sure that your Calendar table's Date column has a relationship with Hired Date, and that the relationship to the Department table is in place.
Count of Emp =
VAR _latest =
MAX ( 'Table 2'[Date] )
VAR _from =
MIN ( 'Table 2'[Date] )
VAR _dept =
SELECTEDVALUE ( Department[Department] )
RETURN
CALCULATE (
COUNTX ( 'Table', 'Table'[ID ] ),
FILTER (
ALL ( 'Table' ),
'Table'[Terminated Date ] >= _from
&& 'Table'[Hired Date ] <= _from
&& 'Table'[Terminated (Y/N)] = "Y"
&& RELATED ( Department[Department] ) = _dept
)
)
+ CALCULATE (
COUNTX ( 'Table', 'Table'[ID ] ),
FILTER (
ALL ( 'Table' ),
'Table'[Terminated (Y/N)] = "N"
&& 'Table'[Hired Date ] <= _from
&& RELATED ( Department[Department] ) = _dept
)
)

SQL for nested WITH CLAUSE - RESULTS OFFSET in Oracle 19c

Please suggest a way to implement the nesting of (temp - results - select) shown below.
I see that Oracle 19c does not allow nesting of the WITH clause.
with temp2 as
(
with temp1 as
(
__
__
),
results(..fields..) as
(
select ..<calc part>.. from temp1, results where __
)
select ..<calc part>.. from temp1 join results where __
),
results(..fields..) as
(
select ..<calc part>.. from temp2, results where __
)
select ..<calc part>.. from temp2 join results where __
For instance:
DB Fiddle
I need to calculate CALC3 in a similar recursive way to CALC.
CREATE TABLE TEST ( DT DATE, NAME VARCHAR2(10), VALUE NUMBER(10,3));
insert into TEST values ( to_date( '01-jan-2021'), 'apple', 198.95 );
insert into TEST values ( to_date( '02-jan-2021'), 'apple', 6.15 );
insert into TEST values ( to_date( '03-jan-2021'), 'apple', 4.65 );
insert into TEST values ( to_date( '06-jan-2021'), 'apple', 20.85 );
insert into TEST values ( to_date( '01-jan-2021'), 'banana', 80.5 );
insert into TEST values ( to_date( '02-jan-2021'), 'banana', 9.5 );
insert into TEST values ( to_date( '03-jan-2021'), 'banana', 31.65 );
--Existing working code -
with t as
( select
test.*,
row_number() over ( partition by name order by dt ) as seq
from test
),
results(name, dt, value, calc ,seq) as
(
select name, dt, value, value/5 calc, seq
from t
where seq = 1
union all
select t.name, t.dt, t.value, ( 4 * results.calc + t.value ) / 5, t.seq
from t, results
where t.seq - 1 = results.seq
and t.name = results.name
)
select results.*, calc*3 as calc2 -- Some xyz complex logic as calc2
from results
order by name, seq;
Desired output:
CALC3 - grouped by name and dt -
((CALC3 of prev day record * 4) + CALC2 of current record) / 5
i.e. for APPLE:
for 1-jan-21, CALC3 = ((0*4)+119.37)/5 = 23.87 -------> since it is the 1st record, 0 is taken as the previous day's CALC3
for 2-jan-21, CALC3 = ((23.87*4)+99.19)/5 = 38.93 -----> the previous CALC3 (23.87) comes from 1-jan-21 and 99.19 is CALC2 from the current row
for 3-jan-21, CALC3 = ((38.93*4)+82.14)/5 = 47.58 and so on
For BANANA:
1-jan-21, CALC3 = ((0*4)+48.30)/5 = 9.66
2-jan-21, CALC3 = ((9.66*4)+44.34)/5 = 16.60
etc.
You do not need to, you can just do it all in one level:
with temp1(...fields...) as
(
__
__
__
),
results1(...fields...) as
(
select ...<calc part>... from temp1 where __
),
temp2( ...fields...) as
(
select ...<calc part>... from temp1 join results1 where __
),
results2(...fields...) as
(
select ...<calc part>... from temp2 where __
)
select ...<calc part>... from temp2 join results2 where __
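Applied to the actual tables in the question, the same single-level idea can also carry CALC3 through the recursive member itself; a rough sketch along the lines of the existing query (not a definitive implementation):
with t as (
  select test.*,
         row_number() over ( partition by name order by dt ) as seq
  from test
),
results(name, dt, value, calc, calc2, calc3, seq) as (
  select name, dt, value,
         value/5,                          -- calc for the first row
         3*(value/5),                      -- calc2 = calc * 3
         (3*(value/5))/5,                  -- calc3 seeded from calc2
         seq
  from t
  where seq = 1
  union all
  select t.name, t.dt, t.value,
         (4*results.calc + t.value)/5,
         3*((4*results.calc + t.value)/5),
         (4*results.calc3 + 3*((4*results.calc + t.value)/5))/5,
         t.seq
  from t, results
  where t.seq - 1 = results.seq
    and t.name = results.name
)
select * from results order by name, seq;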
For your actual problem, you can use a MODEL clause:
SELECT dt,
name,
amount,
calc,
seq,
calc2,
calc3
FROM (
SELECT t.*,
ROW_NUMBER() OVER (PARTITION BY name ORDER BY dt) AS seq
FROM test t
)
MODEL
PARTITION BY (name)
DIMENSION BY (seq)
MEASURES ( dt, amount, 0 AS calc, 0 AS calc2, 0 as calc3)
RULES (
calc[1] = amount[1]/5,
calc[seq>1] = (amount[cv(seq)] + 4*calc[cv(seq)-1])/5,
calc2[seq] = 3*calc[cv(seq)],
calc3[1] = calc2[1]/5,
calc3[seq>1] = (calc2[cv(seq)] + 4*calc3[cv(seq)-1])/5
)
Which outputs:
DT         NAME    AMOUNT  CALC      SEQ  CALC2     CALC3
01-JAN-21  banana  80.5    16.1      1    48.3      9.66
02-JAN-21  banana  9.5     14.78     2    44.34     16.596
03-JAN-21  banana  31.65   18.154    3    54.462    24.1692
01-JAN-21  apple   198.95  39.79     1    119.37    23.874
02-JAN-21  apple   6.15    33.062    2    99.186    38.9364
03-JAN-21  apple   4.65    27.3796   3    82.1388   47.57688
06-JAN-21  apple   20.85   26.07368  4    78.22104  53.705712
db<>fiddle here