# MedicineSearch/find_pbs_brand_names.py

#!/usr/bin/env python3
#   Copyright © 2023  Lee Yingtong Li (RunasSudo)
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU Affero General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU Affero General Public License for more details.
#
#   You should have received a copy of the GNU Affero General Public License
#   along with this program.  If not, see <https://www.gnu.org/licenses/>.

import re
import sqlite3

# Matches text made up solely of dose-like tokens: the characters 0-9 / . , + %
# and the units mg, mL, U or IU (each unit optionally preceded by one space).
# Intended for use with fullmatch() on a single word; the empty string matches.
#
# The pattern is one flat repetition of alternatives that can never start with
# the same character, so a non-matching word is rejected in linear time. Nesting
# a '*' inside a '+' group instead would backtrack exponentially on a long run
# of digits followed by a character that is not part of a dose.
LOOKS_LIKE_DOSE = re.compile(r'(?:[0-9/.,+%]| ?(?:mg|mL|U|IU))*')

# Connect to the SQLite database file in the working directory.
# sqlite3.Row lets result columns be read by name, e.g. row['brand_name'].
con = sqlite3.connect('database.db')
con.row_factory = sqlite3.Row
cur = con.cursor()

# Rebuild the output table from scratch on every run
# NOTE(review): SQLite assigns NUMERIC affinity to a column declared STRING, so
# values that look like numbers are stored as numbers — confirm this is intended.
cur.execute('DROP TABLE IF EXISTS pbs_mp_brand_name')
cur.execute(
	'CREATE TABLE pbs_mp_brand_name ('
	'id INTEGER PRIMARY KEY AUTOINCREMENT, '
	'mp_code STRING, '
	'brand_name STRING)'
)

# Fetch every TPP together with the MP code and MP preferred term reached via its MPP
cur.execute('SELECT * FROM pbs_tpp LEFT JOIN (SELECT code, mp_code FROM pbs_mpp) AS pbs_mpp ON pbs_tpp.mpp_code = pbs_mpp.code LEFT JOIN (SELECT code, preferred_term as mp_preferred_term FROM pbs_mp) AS pbs_mp ON pbs_mpp.mp_code = pbs_mp.code')

# Words whose presence marks a "brand name" as really being a company name
_COMPANY_WORDS = {'pty', 'ltd', 'australia'}

# Candidate brand names grouped by MP code: {mp_code: {brand_name, ...}}
brand_names = {}
for tpp in cur.fetchall():
	full_name = tpp['brand_name']
	generic_name = tpp['mp_preferred_term']

	# The LEFT JOINs leave mp_code and mp_preferred_term NULL for a TPP with no
	# matching MPP or MP. Such a row cannot be filed under any medicine, so skip
	# it rather than crash on None (here, or later when the MP codes are sorted).
	if full_name is None or generic_name is None or tpp['mp_code'] is None:
		continue

	full_name_lower = full_name.lower()

	# If any word of the generic name is in the brand name, skip it because it is uninteresting
	# (substring test: a generic word embedded inside a longer brand word also counts)
	if any(w.lower() in full_name_lower for w in generic_name.split() if w != '+'):
		continue

	# Ignore anything that looks like a company name
	if not _COMPANY_WORDS.isdisjoint(full_name_lower.split()):
		continue

	# Strip all trailing words that look like a dose
	words = full_name.split()
	while words and LOOKS_LIKE_DOSE.fullmatch(words[-1]):
		words.pop()

	# A name made up entirely of dose-like words has nothing left over;
	# do not record an empty brand name
	if not words:
		continue

	# OK!
	brand_names.setdefault(tpp['mp_code'], set()).add(' '.join(words))

# Shorten each brand name to its shortest leading run of words that no brand
# name of a different MP starts with. MP codes are visited in sorted order.
for owner_code in sorted(brand_names):
	owner_names = brand_names[owner_code]

	# Walk a snapshot, because the set itself is edited along the way
	for full_name in tuple(owner_names):
		tokens = full_name.split()

		# Try one word, then two, ... always stopping short of the full name
		for length in range(1, len(tokens)):
			prefix = ' '.join(tokens[:length])

			# The prefix is unusable if any name belonging to another MP begins with it
			taken = any(
				other_name.startswith(prefix)
				for other_code, other_names in brand_names.items()
				if other_code != owner_code
				for other_name in other_names
			)

			if not taken:
				# First free prefix wins: swap it in for the long name
				owner_names.discard(full_name)
				owner_names.add(prefix)
				break

# Write the results out, ordered by MP code and then by brand name
rows = [
	(code, name)
	for code in sorted(brand_names)
	for name in sorted(brand_names[code])
]
cur.executemany('INSERT INTO pbs_mp_brand_name (mp_code, brand_name) VALUES (?, ?)', rows)

# Make the inserts permanent
con.commit()