MedicineSearch/find_brand_names.py

#!/usr/bin/env python3
#   Copyright © 2023  Lee Yingtong Li (RunasSudo)
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU Affero General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU Affero General Public License for more details.
#
#   You should have received a copy of the GNU Affero General Public License
#   along with this program.  If not, see <https://www.gnu.org/licenses/>.

import re
import sqlite3

LOOKS_LIKE_DOSE = re.compile(r'([0-9/.,+%]*( ?(mg|mL|U|IU))?)+')

# Open database
con = sqlite3.connect('database.db')
con.row_factory = sqlite3.Row
cur = con.cursor()

# Init schema
cur.execute('DROP TABLE IF EXISTS mp_brand_name')
cur.execute('CREATE TABLE mp_brand_name (id INTEGER PRIMARY KEY AUTOINCREMENT, mp_preferred_term TEXT, brand_name TEXT)')

# Get PBS brand names
cur.execute('SELECT brand_name, mp_preferred_term FROM pbs_tpp LEFT JOIN (SELECT code, mp_code FROM pbs_mpp) AS pbs_mpp ON pbs_tpp.mpp_code = pbs_mpp.code LEFT JOIN (SELECT code, preferred_term as mp_preferred_term FROM pbs_mp) AS pbs_mp ON pbs_mpp.mp_code = pbs_mp.code')

brand_names = {}
for tpp in cur.fetchall():
	words = tpp['brand_name'].split()
	words_lower = tpp['brand_name'].lower().split()

	# If any word of the generic name is in the brand name, skip it because it is uninteresting
	if any(w.lower() in tpp['brand_name'].lower() for w in tpp['mp_preferred_term'].split() if w != '+'):
		continue

	# Ignore anything that looks like a company name
	if 'pty' in words_lower or 'ltd' in words_lower or 'australia' in words_lower:
		continue

	# Strip all trailing words that look like a dose
	for i in reversed(range(len(words))):
		if LOOKS_LIKE_DOSE.fullmatch(words[i]):
			del words[i]
		else:
			break

	# OK!
	brand_name = ' '.join(words)

	if tpp['mp_preferred_term'] not in brand_names:
		brand_names[tpp['mp_preferred_term']] = set()

	brand_names[tpp['mp_preferred_term']].add(brand_name)

# Get non-PBS brand names
cur.execute('SELECT * FROM non_pbs_tpp')
for tpp in cur.fetchall():
	# This is manually curated so no need for cleaning

	if tpp['mp_preferred_term'] not in brand_names:
		brand_names[tpp['mp_preferred_term']] = set()

	brand_names[tpp['mp_preferred_term']].add(tpp['brand_name'])

# Reduce names with unambiguous prefixes
for mp_preferred_term in sorted(brand_names.keys()):
	for brand_name in list(brand_names[mp_preferred_term]):
		# Can we reduce the length of the name?
		words = brand_name.split()

		for i in range(1, len(words)):
			short_name = ' '.join(words[0:i])

			# Conflict if ambiguous brand name prefix
			if any(b.startswith(short_name) for m in brand_names.keys() if m != mp_preferred_term for b in brand_names[m]):
				continue

			# Exceptions
			if short_name == 'Coloxyl with':
				continue  # Looks strange to cut off here
			if short_name == 'Magnesium':
				continue  # Disambiguates Magnesium Forte

			# Can shorten

			if brand_name in brand_names[mp_preferred_term]:
				brand_names[mp_preferred_term].remove(brand_name)

			# Don't list brand name if it is same as generic
			if short_name.lower() != mp_preferred_term.lower():
				brand_names[mp_preferred_term].add(short_name)

			break

# Add to database
for mp_preferred_term in sorted(brand_names.keys()):
	for brand_name in sorted(list(brand_names[mp_preferred_term])):
		cur.execute('INSERT INTO mp_brand_name (mp_preferred_term, brand_name) VALUES (?, ?)', (mp_preferred_term, brand_name))

con.commit()