2023-01-24 19:58:17 +11:00
#!/usr/bin/env python3
2023-01-24 19:56:54 +11:00
# Copyright © 2023 Lee Yingtong Li (RunasSudo)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import re
import sqlite3
# Pattern for a word that looks like a dose or strength, e.g. "500mg", "2.5%"
# or "100IU/mL": any sequence of digits, the punctuation / . , + % (or a
# space) and the units mg, mL, U and IU. The empty string also matches.
# It is meant to be used with fullmatch() on a single word.
#
# Written as one flat alternation rather than a repeated group containing
# another repeat: both forms accept the same strings, but the flat form cannot
# backtrack exponentially on a long word that almost matches. Because the
# space is part of the character class, an optional space before a unit needs
# no separate branch.
LOOKS_LIKE_DOSE = re.compile(r'(?:[0-9/.,+ %]|mg|mL|U|IU)*')
# Open database
# The path is relative, so the file is looked up in the current working directory
con = sqlite3.connect('database.db')
# sqlite3.Row allows result columns to be read by name (row['column'])
con.row_factory = sqlite3.Row
cur = con.cursor()
# Init schema
# The output table is dropped and recreated, so every run starts from an empty table
cur.execute(' DROP TABLE IF EXISTS pbs_mp_brand_name ')
# NOTE(review): SQLite gives a column declared STRING numeric (not TEXT)
# affinity, so numeric-looking values may be stored as numbers - confirm
# this is intended
cur.execute(
    ' CREATE TABLE pbs_mp_brand_name ('
    'id INTEGER PRIMARY KEY AUTOINCREMENT, '
    'mp_code STRING, '
    'brand_name STRING) '
)
# Collect the candidate brand names for each mp_code.
# Every pbs_tpp row is joined to its pbs_mpp row (which supplies mp_code) and,
# through that, to the pbs_mp row (which supplies the generic name as
# mp_preferred_term). Both joins are LEFT JOINs, so mp_code and
# mp_preferred_term are NULL for rows that have no match.
cur.execute(
    ' SELECT * FROM pbs_tpp'
    ' LEFT JOIN (SELECT code, mp_code FROM pbs_mpp) AS pbs_mpp'
    ' ON pbs_tpp.mpp_code = pbs_mpp.code'
    ' LEFT JOIN (SELECT code, preferred_term as mp_preferred_term FROM pbs_mp) AS pbs_mp'
    ' ON pbs_mpp.mp_code = pbs_mp.code '
)

# Maps mp_code -> set of cleaned brand names
brand_names = {}

for tpp in cur.fetchall():
    raw_name = tpp['brand_name']
    mp_code = tpp['mp_code']

    # A row with no brand name, or with no matching mp_code, cannot be filed
    # under any product; skipping it also keeps None out of the dict keys,
    # which are sorted later
    if raw_name is None or mp_code is None:
        continue

    name_lower = raw_name.lower()
    words = raw_name.split()
    words_lower = name_lower.split()

    # If any word of the generic name is in the brand name, skip it because it is uninteresting
    # (this is a substring test against the whole brand name; the '+' that
    # separates the components of a combination product is not a word of interest)
    generic_words = (tpp['mp_preferred_term'] or '').split()
    if any(w.lower() in name_lower for w in generic_words if w != '+'):
        continue

    # Ignore anything that looks like a company name
    if not {'pty', 'ltd', 'australia'}.isdisjoint(words_lower):
        continue

    # Strip all trailing words that look like a dose
    while words and LOOKS_LIKE_DOSE.fullmatch(words[-1]):
        words.pop()

    # Every word looked like a dose, so there is no name left to record
    if not words:
        continue

    # OK!
    brand_names.setdefault(mp_code, set()).add(' '.join(words))
# Reduce names with unambiguous prefixes
# A multi-word name is replaced by its shortest leading run of words that no
# brand name filed under a different mp_code starts with. Names are compared
# against the current contents of brand_names, so names already shortened for
# an earlier mp_code are compared in their shortened form.
for mp_code in sorted(brand_names):
    own_names = brand_names[mp_code]

    # Walk a snapshot, because the set itself is edited inside the loop
    for full_name in tuple(own_names):
        parts = full_name.split()

        # Can we reduce the length of the name?
        # Only proper prefixes are tried (never the full name), shortest first
        for n_words in range(1, len(parts)):
            candidate = ' '.join(parts[:n_words])

            clashes = False
            for other_code, other_names in brand_names.items():
                if other_code == mp_code:
                    continue
                if any(name.startswith(candidate) for name in other_names):
                    # Conflict
                    clashes = True
                    break

            if clashes:
                continue

            # Can shorten
            own_names.discard(full_name)
            own_names.add(candidate)
            break
# Add to database
# Rows are written ordered by mp_code and then by brand name
rows = [
    (mp_code, name)
    for mp_code in sorted(brand_names)
    for name in sorted(brand_names[mp_code])
]
for row in rows:
    cur.execute(' INSERT INTO pbs_mp_brand_name (mp_code, brand_name) VALUES (?, ?) ', row)

con.commit()