#!/usr/bin/env python3 # Copyright © 2023 Lee Yingtong Li (RunasSudo) # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Affero General Public License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . import re import sqlite3 LOOKS_LIKE_DOSE = re.compile(r'([0-9/.,+%]*( ?(mg|mL|U|IU))?)+') # Open database con = sqlite3.connect('database.db') con.row_factory = sqlite3.Row cur = con.cursor() # Init schema cur.execute('DROP TABLE IF EXISTS mp_brand_name') cur.execute('CREATE TABLE mp_brand_name (id INTEGER PRIMARY KEY AUTOINCREMENT, mp_preferred_term TEXT, brand_name TEXT)') # Get PBS brand names cur.execute('SELECT brand_name, mp_preferred_term FROM pbs_tpp LEFT JOIN (SELECT code, mp_code FROM pbs_mpp) AS pbs_mpp ON pbs_tpp.mpp_code = pbs_mpp.code LEFT JOIN (SELECT code, preferred_term as mp_preferred_term FROM pbs_mp) AS pbs_mp ON pbs_mpp.mp_code = pbs_mp.code') brand_names = {} for tpp in cur.fetchall(): words = tpp['brand_name'].split() words_lower = tpp['brand_name'].lower().split() # If any word of the generic name is in the brand name, skip it because it is uninteresting if any(w.lower() in tpp['brand_name'].lower() for w in tpp['mp_preferred_term'].split() if w != '+'): continue # Ignore anything that looks like a company name if 'pty' in words_lower or 'ltd' in words_lower or 'australia' in words_lower: continue # Strip all trailing words that look like a dose for i in reversed(range(len(words))): if LOOKS_LIKE_DOSE.fullmatch(words[i]): del words[i] else: break # OK! brand_name = ' '.join(words) if tpp['mp_preferred_term'] not in brand_names: brand_names[tpp['mp_preferred_term']] = set() brand_names[tpp['mp_preferred_term']].add(brand_name) # Get non-PBS brand names cur.execute('SELECT * FROM non_pbs_tpp') for tpp in cur.fetchall(): # This is manually curated so no need for cleaning if tpp['mp_preferred_term'] not in brand_names: brand_names[tpp['mp_preferred_term']] = set() brand_names[tpp['mp_preferred_term']].add(tpp['brand_name']) # Reduce names with unambiguous prefixes for mp_preferred_term in sorted(brand_names.keys()): for brand_name in list(brand_names[mp_preferred_term]): # Can we reduce the length of the name? words = brand_name.split() for i in range(1, len(words)): short_name = ' '.join(words[0:i]) # Conflict if ambiguous brand name prefix if any(b.startswith(short_name) for m in brand_names.keys() if m != mp_preferred_term for b in brand_names[m]): continue # Exceptions if short_name == 'Coloxyl with': continue # Looks strange to cut off here if short_name == 'Magnesium': continue # Disambiguates Magnesium Forte # Can shorten if brand_name in brand_names[mp_preferred_term]: brand_names[mp_preferred_term].remove(brand_name) # Don't list brand name if it is same as generic if short_name.lower() != mp_preferred_term.lower(): brand_names[mp_preferred_term].add(short_name) break # Add to database for mp_preferred_term in sorted(brand_names.keys()): for brand_name in sorted(list(brand_names[mp_preferred_term])): cur.execute('INSERT INTO mp_brand_name (mp_preferred_term, brand_name) VALUES (?, ?)', (mp_preferred_term, brand_name)) con.commit()