FAQ: Cleaning up removed models from your production schema

I took a bunch of the above suggestions and after some moderate swearing :melting_face: have something for bigquery

few notes:

  1. kept the schema arg. I want to be able to pass in the dbt-managed list of datasets (the arg is still named schemas cause I like snowflake better :smirk: )
  2. instead of BRONZE schema check, added a donot_drop_list seed w/just SCHEMA name and TABLE name columns b/c had a client once with some non-dbt tables sprinkled in their dbt managed schemas. uh. donā€™t do this. butā€¦ if already there then need a way to skip them
  3. stale_days - added an argument so can query/drop tables that havenā€™t been touched in a specific number of days (I couldnā€™t find ā€œlast_alteredā€ type info for views in BQ. anyone know where thatā€™d be ?)
  4. added a few extra details (row count/last altered) to output b/c I found this info helpful when a client thatā€™s been around a while had a lot of orphaned objects to review.
  5. schema name case could be an issue depending on your settings if you name tHingS_WeIRd

{% macro drop_orphaned_tables_bigquery(schema, stale_days = 60, dry_run = true) %} 
ā€‹
{% if execute %}
  {% if (schema is not string and schema is not iterable) or schema is mapping or schema|length <= 0 %}
    {% do exceptions.raise_compiler_error('"schema" must be a string or a list') %}
  {% endif %}
  {% if schema is string %}
  {% set schema = [schema] %}
  {% endif %}
ā€‹
  {% call statement('get_outdated_tables', fetch_result=True) %}
    select c.schema_name,
           c.ref_name,
           c.ref_type,
           c.last_altered,
           c.row_count
    from (
        select dataset_id as schema_name,
               table_id as ref_name,
               'table' as ref_type,
               cast(TIMESTAMP_MILLIS(last_modified_time) as string) as last_altered,
               cast(row_count as string) as row_count
      from (

        {%- for s in schema %}
         select * from {{ s }}.__TABLES__
        {% if not loop.last %}union all {% endif %}
        {%- endfor -%}
      )
      where TIMESTAMP_MILLIS(last_modified_time) < TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL {{stale_days}} DAY)
        and dataset_id||table_id not in (select table_schema||table_name from {{ref('donot_drop_list')}})
      
    union all
    select table_schema as schema_name, 
           table_name  as ref_name, 
           'view'     as ref_type,
           'NA' as last_altered, --can't find last modified date in BQ for views 
           'NA'  as row_count
      from (
        {%- for s in schema %}
         select * from {{ s }}.INFORMATION_SCHEMA.VIEWS
        {% if not loop.last %}union all {% endif %}
        {%- endfor -%}
      )
        where  table_schema||table_name not in (select table_schema||table_name from {{ref('donot_drop_list')}})
      ) as c
    left join ( select * FROM UNNEST([STRUCT<schema_name STRING, ref_name STRING>
      {%- for node in graph['nodes'].values() | selectattr("resource_type", "equalto", "model") | list
                    + graph['nodes'].values() | selectattr("resource_type", "equalto", "seed")  | list 
                    + graph['nodes'].values() | selectattr("resource_type", "equalto", "snapshot")  | list%} 
            {%- if (node.schema in schema) and node.alias and node.alias != None %}
                ('{{ node.schema }}', '{{node.alias}}'),
            {%- endif -%}
            {%- if loop.last %}  {# this is to avoid the issue of the last node in graph having alias = 'None' or being in a different schema causing compile error due to the "," if node.alias is None here it doesn't really hurt anything#}
                ('{{ node.schema }}','{{node.alias}}')
                ])
            {%- endif %}
      {%- endfor %}
    ) as desired on desired.schema_name = c.schema_name
                 and desired.ref_name    = c.ref_name
    where desired.ref_name is null
  {% endcall %}
ā€‹
  {%- for to_delete in load_result('get_outdated_tables')['data'] %} 
    {% set fqn = target.database + '.' + to_delete[0] + '.' + to_delete[1] %}
    {% if dry_run == false %}
    {% call statement() -%}
      {% do log('dropping ' ~ to_delete[2] ~ ': ' ~ fqn, info=true) %}
      drop {{ to_delete[2] }} if exists {{ fqn }};
    {%- endcall %}
    {% elif dry_run == true %}
    {% do log( 'drop '~to_delete[2] ~ ' if exists ' ~ fqn ~'; -- last_altered:'~ to_delete[3].strftime("%d-%b-%Y %H:%M") ~'; row_count:'~ to_delete[4], info=true) %}
    {% endif %}
    {%- if loop.last and dry_run == true %}
        {% do log('Please drop the above objects or move them to the proper backup schema.\n', info=true) %}
    {% endif %}
  {%- endfor %}
ā€‹
   {%- set response = load_result('get_outdated_tables')['response']  %}
   {% do log('Query Status + # of results found: '~ response, info=true) %}
ā€‹
    {% endif %}
{% endmacro %}