I took a bunch of the above suggestions and after some moderate swearing have something for bigquery
few notes:
- kept the schema arg. I want to be able to pass in the dbt-managed list of datasets (the arg is still named schemas cause I like snowflake better )
- instead of BRONZE schema check, added a donot_drop_list seed w/just SCHEMA name and TABLE name columns b/c had a client once with some non-dbt tables sprinkled in their dbt managed schemas. uh. donāt do this. butā¦ if already there then need a way to skip them
- stale_days - added an argument so can query/drop tables that havenāt been touched in a specific number of days (I couldnāt find ālast_alteredā type info for views in BQ. anyone know where thatād be ?)
- added a few extra details (row count/last altered) to output b/c I found this info helpful when a client thatās been around a while had a lot of orphaned objects to review.
- schema name case could be an issue depending on your settings if you name tHingS_WeIRd
{% macro drop_orphaned_tables_bigquery(schema, stale_days = 60, dry_run = true) %}
ā
{% if execute %}
{% if (schema is not string and schema is not iterable) or schema is mapping or schema|length <= 0 %}
{% do exceptions.raise_compiler_error('"schema" must be a string or a list') %}
{% endif %}
{% if schema is string %}
{% set schema = [schema] %}
{% endif %}
ā
{% call statement('get_outdated_tables', fetch_result=True) %}
select c.schema_name,
c.ref_name,
c.ref_type,
c.last_altered,
c.row_count
from (
select dataset_id as schema_name,
table_id as ref_name,
'table' as ref_type,
cast(TIMESTAMP_MILLIS(last_modified_time) as string) as last_altered,
cast(row_count as string) as row_count
from (
{%- for s in schema %}
select * from {{ s }}.__TABLES__
{% if not loop.last %}union all {% endif %}
{%- endfor -%}
)
where TIMESTAMP_MILLIS(last_modified_time) < TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL {{stale_days}} DAY)
and dataset_id||table_id not in (select table_schema||table_name from {{ref('donot_drop_list')}})
union all
select table_schema as schema_name,
table_name as ref_name,
'view' as ref_type,
'NA' as last_altered, --can't find last modified date in BQ for views
'NA' as row_count
from (
{%- for s in schema %}
select * from {{ s }}.INFORMATION_SCHEMA.VIEWS
{% if not loop.last %}union all {% endif %}
{%- endfor -%}
)
where table_schema||table_name not in (select table_schema||table_name from {{ref('donot_drop_list')}})
) as c
left join ( select * FROM UNNEST([STRUCT<schema_name STRING, ref_name STRING>
{%- for node in graph['nodes'].values() | selectattr("resource_type", "equalto", "model") | list
+ graph['nodes'].values() | selectattr("resource_type", "equalto", "seed") | list
+ graph['nodes'].values() | selectattr("resource_type", "equalto", "snapshot") | list%}
{%- if (node.schema in schema) and node.alias and node.alias != None %}
('{{ node.schema }}', '{{node.alias}}'),
{%- endif -%}
{%- if loop.last %} {# this is to avoid the issue of the last node in graph having alias = 'None' or being in a different schema causing compile error due to the "," if node.alias is None here it doesn't really hurt anything#}
('{{ node.schema }}','{{node.alias}}')
])
{%- endif %}
{%- endfor %}
) as desired on desired.schema_name = c.schema_name
and desired.ref_name = c.ref_name
where desired.ref_name is null
{% endcall %}
ā
{%- for to_delete in load_result('get_outdated_tables')['data'] %}
{% set fqn = target.database + '.' + to_delete[0] + '.' + to_delete[1] %}
{% if dry_run == false %}
{% call statement() -%}
{% do log('dropping ' ~ to_delete[2] ~ ': ' ~ fqn, info=true) %}
drop {{ to_delete[2] }} if exists {{ fqn }};
{%- endcall %}
{% elif dry_run == true %}
{% do log( 'drop '~to_delete[2] ~ ' if exists ' ~ fqn ~'; -- last_altered:'~ to_delete[3].strftime("%d-%b-%Y %H:%M") ~'; row_count:'~ to_delete[4], info=true) %}
{% endif %}
{%- if loop.last and dry_run == true %}
{% do log('Please drop the above objects or move them to the proper backup schema.\n', info=true) %}
{% endif %}
{%- endfor %}
ā
{%- set response = load_result('get_outdated_tables')['response'] %}
{% do log('Query Status + # of results found: '~ response, info=true) %}
ā
{% endif %}
{% endmacro %}