#!/usr/bin/env python # -*- coding: utf-8 -*- # Local path and filenames blocklist_path = "/home/foo/path/to/update_iceshrimp_net_blocklist/" blocklist_filename = "blocklist_multiple.txt" blocklist_log_path = "/home/foo/Logs/" blocklist_log_filename = "iceshrimp_net_blocklist_multiple_v2_updated" # The url library import urllib.request # The pandas library (pip install pandas) import pandas as pd # The SQL_Alchemy library (pip3 install sqlalchemy) from sqlalchemy import create_engine, text # The OS library (for checking whether files and directories exist) import os # Check that a 'blocklist_path' directory exists if os.path.isdir(blocklist_path): print("A 'blocklist_path' directory exists, continuing...") print("") else: print("The 'blocklist_path' directory does not exist, exiting...") print("") quit() # Check that a 'blocklist_log_path' directory exists (the most common error when running for the first time) if os.path.isdir(blocklist_log_path): print("A 'blocklist_log_path' directory exists, continuing...") print("") else: print("The 'blocklist_log_path' directory does not exist, exiting...") print("") quit() # Establish a SQL_Alchemy DB connection alchemyConn = create_engine('postgresql://iceshrimp.user:super-long-password@localhost:5432/iceshrimp.database') # Uncomment only one 'delete_statement' # Remove nothing from the 'public.blocked_instance' table, importing the blocklist(s) and doing nothing else (this is the default) delete_statement = text("DELETE FROM public.blocked_instance where host is null") # Remove all of the domains from the 'public.blocked_instance' table that are followed by a single account (replace 'your_user_id' with a correct 'followerId') #delete_statement = text("DELETE FROM public.blocked_instance WHERE host IN (SELECT "'"followeeHost"'" AS "'"followed_domains"'" FROM public.following WHERE "'"followerId"'" LIKE 'your_user_id' GROUP BY "'"followed_domains"'")") # Remove all of the domains from the 'public.blocked_instance' table that are followed by all accounts #delete_statement = text("DELETE FROM public.blocked_instance WHERE host IN (SELECT "'"followeeHost"'" AS "'"followed_domains"'" FROM public.following GROUP BY "'"followed_domains"'")") # Check that a 'delete_statement' has been uncommented try: delete_statement except NameError: print("No 'delete_statement' has been chosen, exiting...") print("") quit() else: print("A 'delete_statement' has been chosen, continuing...") print("") # Check that a 'delete_statement' has a valid 'followerId' if "your_user_id" in str(delete_statement): print("Invalid 'followerId' in the 'delete_statement', exiting...") print("") quit() else: print("The 'delete_statement' is valid, continuing...") print("") # Python pathname pattern matching import glob # Download all of the *.csv files in 'blocklist.txt' into the current directory with open(blocklist_path+blocklist_filename) as f: for line in f: url = line blocklist_filename = url.split('/', -1)[-1] urllib.request.urlretrieve(url, blocklist_path+blocklist_filename.rstrip('\n')) # Put all of the *.csv files in the current directory into a single DataFrame (df_original) files = glob.glob(blocklist_path+"*.csv") content = [] # Loop through all the *.csv files for filename in files: df_original = pd.read_csv(filename, index_col=None) print(len(df_original.index), "rows in", filename) print("") content.append(df_original) # Convert 'content' to a DataFrame df_original = pd.concat(content) # DROP extraneous column by COPY'ing DataFrame (df_original) to a new DataFrame (df_trimmed) df_trimmed = df_original[['domain', 'public_comment']].copy() # Rename a couple of the DataFrame (df_trimmed) columns to match the current 'public.blocked_instance' schema df_trimmed.rename(columns={'domain': 'host', 'public_comment': 'reason'}, inplace=True) # Print the # of DataFrame (df_trimmed) rows print(len(df_trimmed.index), "TOTAL rows in the DataFrame (df_trimmed)") print("") # Make a unique DataFrame (df_unique) df_unique = df_trimmed.drop_duplicates(subset=['host'], keep='last') # Print the # of rows in the unique DataFrame (df_unique) print(len(df_unique.index), "UNIQUE rows in the DataFrame (df_unique)") print("") # Execute and error-trap try: # DELETE any existing records in 'public.blocked_instance' with alchemyConn.connect() as conn: truncate_statement = text("TRUNCATE TABLE blocked_instance") conn.execute(truncate_statement) # Commit the TRUNCATE changes to the database conn.commit() print("TRUNCATE of 'public.blocked_instance' table COMMITTED") print("") # INSERT the DataFrame (df_unique) into the 'public.blocked_instance' table df_unique.to_sql("blocked_instance", alchemyConn, if_exists='append', index=False) print(len(df_unique.index), "new rows INSERTED into 'public.blocked_instance' table from the DataFrame (df_unique)") print("") # DELETE an aggregated list of Domains currently followed from the 'public.blocked_instance' table conn.execute(delete_statement) # Commit the DELETE changes to the database conn.commit() print("DELETE of 'followed_domains' from 'public.blocked_instance' table COMMITTED") print("") # Count the number of records now in the 'public.blocked_instance' table count_statement = text("SELECT count(*) AS exact_count FROM public.blocked_instance") row_count = conn.execute(count_statement) count = row_count.scalar() # Send a message print(count, "rows now in 'public.blocked_instance' table after 'followed_domains' DELETED") print("") # Close the alchemyConn connection alchemyConn.dispose() print("Connection CLOSED") print("") # Create a log file to show that the UPDATE has successfully completed open(blocklist_log_path+blocklist_log_filename, 'w') print("Log file CREATED") except: # Roll back in case of error with alchemyConn.connect() as conn: conn.rollback() print("FAILED to INSERT from the DataFrame") print("FAILED to DELETE and rolled back")