DQ dupes cleanup/investigation

Assign
Data Source
Priority
Status
IceBox
Description
Estimated Completion
Tags
Property
Current results:
notion image
 
Above based on:
-- discord_messages check
select
 author_user_id,
 content,
 timestamp,
 count(1) as row_cnt
from discord_messages
group by 1,2,3
having count(*) > 1;

-- detailed info on the above
select
 row_number() over (
   partition by dm.author_user_id, dm.content, dm.timestamp
   order by dm.discord_message_id desc
  ) as rn, 
 dm.*
from discord_messages dm
 join (
   select author_user_id, content, timestamp
   from discord_messages
   group by 1,2,3
   having count(*) > 1
  ) t on dm.author_user_id = t.author_user_id
     and coalesce(dm.content, '') = coalesce(t.content, '')
     and dm.timestamp = t.timestamp
limit 100;
In majority of cases ^ content = NULL so correlated with DQ Blank Data findings
 
-- stg_subgraph_bank_1 check
select
 from_address,
 to_address,
 amount_display,
 tx_timestamp,
 count(1) as row_cnt
from stg_subgraph_bank_1
group by 1,2,3,4
having count(*) > 1;
Looking at these ^ though - looks like they might be multisig txns sent to the same address:
notion image
False positive?
 
Suggested actions
  • delete dupes for discord messages based on the check above? if agreed:
delete from discord_messages
where discord_message_id in (
  select m.discord_message_id
  from discord_messages m
   join (
    select
     row_number() over (
       partition by dm.author_user_id, dm.content, dm.timestamp
       order by dm.discord_message_id desc
      ) as rn, 
     dm.discord_message_id
    from discord_messages dm
     join (
       select author_user_id, content, timestamp
       from discord_messages
       group by 1,2,3
       having count(*) > 1
      ) t on dm.author_user_id = t.author_user_id
       and coalesce(dm.content, '') = coalesce(t.content, '')
       and dm.timestamp = t.timestamp
    ) dupes on m.discord_message_id = dupes.discord_message_id
  where dupes.rn > 1
);

DQ dupes cleanup/investigation

Assign
Data Source
Priority
Status
IceBox
Description
Estimated Completion
Tags
Property
Current results:
notion image
 
Above based on:
-- discord_messages check
select
 author_user_id,
 content,
 timestamp,
 count(1) as row_cnt
from discord_messages
group by 1,2,3
having count(*) > 1;

-- detailed info on the above
select
 row_number() over (
   partition by dm.author_user_id, dm.content, dm.timestamp
   order by dm.discord_message_id desc
  ) as rn, 
 dm.*
from discord_messages dm
 join (
   select author_user_id, content, timestamp
   from discord_messages
   group by 1,2,3
   having count(*) > 1
  ) t on dm.author_user_id = t.author_user_id
     and coalesce(dm.content, '') = coalesce(t.content, '')
     and dm.timestamp = t.timestamp
limit 100;
In majority of cases ^ content = NULL so correlated with DQ Blank Data findings
 
-- stg_subgraph_bank_1 check
select
 from_address,
 to_address,
 amount_display,
 tx_timestamp,
 count(1) as row_cnt
from stg_subgraph_bank_1
group by 1,2,3,4
having count(*) > 1;
Looking at these ^ though - looks like they might be multisig txns sent to the same address:
notion image
False positive?
 
Suggested actions
  • delete dupes for discord messages based on the check above? if agreed:
delete from discord_messages
where discord_message_id in (
  select m.discord_message_id
  from discord_messages m
   join (
    select
     row_number() over (
       partition by dm.author_user_id, dm.content, dm.timestamp
       order by dm.discord_message_id desc
      ) as rn, 
     dm.discord_message_id
    from discord_messages dm
     join (
       select author_user_id, content, timestamp
       from discord_messages
       group by 1,2,3
       having count(*) > 1
      ) t on dm.author_user_id = t.author_user_id
       and coalesce(dm.content, '') = coalesce(t.content, '')
       and dm.timestamp = t.timestamp
    ) dupes on m.discord_message_id = dupes.discord_message_id
  where dupes.rn > 1
);