develooper Front page | perl.dbd.pg.changes | Postings from September 2011

[DBD::Pg] Another way of handling the UTF8 mess, per discussions on The Channel.

From:
dbdpg-commits
Date:
September 12, 2011 20:55
Subject:
[DBD::Pg] Another way of handling the UTF8 mess, per discussions on The Channel.
Message ID:
1315886137-368-1-git-send-email-dbdpg-commits@bucardo.org
Committed by Greg Sabino Mullane <greg@endpoint.com>

Another way of handling the UTF8 mess, per discussions on The Channel.

---
 Pg.pm    |   19 +++++----
 dbdimp.c |  134 +++++++++++++++++++++++++++++++++++++-------------------------
 dbdimp.h |    3 +-
 3 files changed, 92 insertions(+), 64 deletions(-)

diff --git a/Pg.pm b/Pg.pm
index 989a245..9459f63 100644
--- a/Pg.pm
+++ b/Pg.pm
@@ -1625,7 +1625,7 @@ use 5.006001;
 				pg_bool_tf                     => undef,
 				pg_db                          => undef,
 				pg_default_port                => undef,
-				pg_unicode                     => undef,
+				pg_utf8_flag                   => undef,
 				pg_enable_utf8                 => undef,
 				pg_errorlevel                  => undef,
 				pg_expand_array                => undef,
@@ -3122,19 +3122,20 @@ DBD::Pg specific attribute. Defaults to false. When true, question marks inside
 are not treated as L<placeholders|/Placeholders>. Useful for statements that contain unquoted question 
 marks, such as geometric operators.
 
-=head3 B<pg_unicode> (boolean)
+=head3 B<pg_utf8_flag> (integer)
 
 DBD::Pg specific attribute. In normal use, this should not be needed, as it will be set 
-automatically according to the server encoding. SQL_ASCII will set this to false, while 
-everything else will set it to true. If you force it off, then everything will be returned 
-as byte soup, even data from UTF-8 databases, which is very likely not what you want. If 
-you force it on for SQL_ASCII databases, the results will be unpredictable. It is recommended 
-that you only use this attribute as a last resort and with a full understanding of what 
-it does.
+automatically according to the client encoding. If the client_encoding is 'UTF8', this 
+attribute will be turned on, which will cause strings coming back from the database to 
+be marked with Perl's internal utf8 flag. If you explicitly set this flag to 0 or 1, no 
+checking of client_encoding will be done. Do not use this flag unless you really know 
+what you are doing, and understand how utf8 differs from UTF8. Setting to 1 will always 
+cause the flag to be set. Setting to 0 will prevent the flag from ever being set. 
+Setting to -1 will switch to the default behavior of checking the client_encoding.
 
 =head3 B<pg_enable_utf8> (boolean)
 
-Deprecated, please use pg_unicode instead.
+Deprecated.
 
 =head3 B<pg_errorlevel> (integer)
 
diff --git a/dbdimp.c b/dbdimp.c
index 4a151b7..eb54f11 100644
--- a/dbdimp.c
+++ b/dbdimp.c
@@ -78,6 +78,7 @@ typedef enum
 
 static void pg_error(pTHX_ SV *h, int error_num, const char *error_msg);
 static void pg_warn (void * arg, const char * message);
+static void check_client_encoding(pTHX_ imp_dbh_t *imp_dbh);
 static ExecStatusType _result(pTHX_ imp_dbh_t *imp_dbh, const char *sql);
 static ExecStatusType _sqlstate(pTHX_ imp_dbh_t *imp_dbh, PGresult *result);
 static int pg_db_rollback_commit (pTHX_ SV *dbh, imp_dbh_t *imp_dbh, int action);
@@ -108,9 +109,6 @@ int dbd_db_login6 (SV * dbh, imp_dbh_t * imp_dbh, char * dbname, char * uid, cha
 	bool           inquote = DBDPG_FALSE;
 	STRLEN         connect_string_size;
 	ConnStatusType connstatus;
-	int            unicode;
-	const char *   server_encoding;
-	const char *   client_encoding;
 
 	if (TSTART) TRC(DBILOGFP, "%sBegin dbd_db_login\n", THEADER);
   
@@ -213,33 +211,22 @@ int dbd_db_login6 (SV * dbh, imp_dbh_t * imp_dbh, char * dbname, char * uid, cha
 	TRACE_PQPROTOCOLVERSION;
 	imp_dbh->pg_protocol = PQprotocolVersion(imp_dbh->conn);
 
-	/* Check the value of the pg_unicode attribute. Default to not set (-1) */
-	unicode = -1;
-	DBD_ATTRIB_GET_IV(attr, "pg_unicode", 10, svp, unicode);
-
-	/*
-	  We need to see if we are treating things with utf8 respect, or as byte soup
-	  The rules are:
-	  - An explicit pg_unicode setting trumps everything else
-	  - A server_encoding of SQL_ASCII is always byte soup
-      - If the client_encoding matches the server_encoding, set unicode on
-	  - Otherwise, we leave things alone
-	*/
-	client_encoding = PQparameterStatus(imp_dbh->conn, "client_encoding");
+	/* Check the value of the pg_utf8_flag attribute */
+	imp_dbh->pg_utf8_flag = -1;
+	DBD_ATTRIB_GET_IV(attr, "pg_utf8_flag", 12, svp, imp_dbh->pg_utf8_flag);
+	if (imp_dbh->pg_utf8_flag == -1) { /* Has not been explicitly set by the user */
+		/*
+		  Check the client_encoding. If UTF-8, set the flag on, else off
+		*/
+		imp_dbh->utf8_flag = (0 == strncmp(PQparameterStatus(imp_dbh->conn, "client_encoding"), "UTF8", 4))
+			? 0 : 1;
+	}
+	else {
+		/* We allow -1 and 0 direct, and force everything else to 1 */
+		if (imp_dbh->pg_utf8_flag < -1 || imp_dbh->pg_utf8_flag > 1)
+			imp_dbh->pg_utf8_flag = imp_dbh->pg_utf8_flag ? 1 : 0;
 
-	if (unicode > 1) { /* Force it on, no matter what */
-	  imp_dbh->unicode = DBDPG_TRUE;
-    }
-    else {
-		if (unicode == 0) { /* Force it off, no matter what */
-			imp_dbh->unicode = DBDPG_FALSE;
-		}
-		else { /* Neither is set, so check the encodings */
-			server_encoding = PQparameterStatus(imp_dbh->conn, "server_encoding");
-			/* If they match, set unicode to true, otherwise, false */
-			imp_dbh->unicode = (0==strcmp(server_encoding, client_encoding))
-					   ? DBDPG_TRUE : DBDPG_FALSE;
-		}
+		imp_dbh->utf8_flag = imp_dbh->pg_utf8_flag;
 	}
 
 	/* Figure out this particular backend's version */
@@ -286,12 +273,6 @@ int dbd_db_login6 (SV * dbh, imp_dbh_t * imp_dbh, char * dbname, char * uid, cha
 	/* Tell DBI that we should call disconnect when the handle dies */
 	DBIc_ACTIVE_on(imp_dbh);
 
-	/* If needed, set the client_encoding to UTF-8 */
-	if (imp_dbh->unicode &&
-		(0 != strncmp(client_encoding, "UTF-8", 5))) {
-		PQexec(imp_dbh->conn, "SET client_encoding = 'UTF-8'");
-	}
-
 	if (TEND) TRC(DBILOGFP, "%sEnd dbd_db_login\n", THEADER);
 
 	return 1;
@@ -323,7 +304,7 @@ static void pg_error (pTHX_ SV * h, int error_num, const char * error_msg)
 	sv_setpv(DBIc_STATE(imp_xxh), (char*)imp_dbh->sqlstate);
 
 	/* Set as utf-8 */
-	if (imp_dbh->unicode)
+	if (imp_dbh->utf8_flag)
 		SvUTF8_on(DBIc_ERRSTR(imp_xxh));
 
 	if (TEND) TRC(DBILOGFP, "%sEnd pg_error\n", THEADER);
@@ -387,7 +368,7 @@ static ExecStatusType _result(pTHX_ imp_dbh_t * imp_dbh, const char * sql)
 	if (TSQL) TRC(DBILOGFP, "%s;\n\n", sql);
 
 	/* Upgrade to a true UTF-8 string in place as needed */
-	if (imp_dbh->unicode) {
+	if (imp_dbh->utf8_flag) {
 		// upgrade_utf8 magic on 'sql'
 	}
 
@@ -396,6 +377,8 @@ static ExecStatusType _result(pTHX_ imp_dbh_t * imp_dbh, const char * sql)
 
 	status = _sqlstate(aTHX_ imp_dbh, result);
 
+	check_client_encoding(aTHX_ imp_dbh);
+
 	TRACE_PQCLEAR;
 	PQclear(result);
 
@@ -749,7 +732,7 @@ SV * dbd_db_FETCH_attrib (SV * dbh, imp_dbh_t * imp_dbh, SV * keysv)
 		}
 		break;
 
-	case 10: /* AutoCommit  pg_bool_tf  pg_pid_number  pg_options  pg_unicode */
+	case 10: /* AutoCommit  pg_bool_tf  pg_pid_number  pg_options  */
 
 		if (strEQ("AutoCommit", key))
 			retsv = boolSV(DBIc_has(imp_dbh, DBIcf_AutoCommit));
@@ -761,8 +744,6 @@ SV * dbd_db_FETCH_attrib (SV * dbh, imp_dbh_t * imp_dbh, SV * keysv)
 			TRACE_PQOPTIONS;
 			retsv = newSVpv(PQoptions(imp_dbh->conn),0);
 		}
-		else if (strEQ("pg_unicode", key))
-			retsv = newSViv((IV)imp_dbh->unicode);
 		break;
 
 	case 11: /* pg_INV_READ  pg_protocol */
@@ -773,10 +754,12 @@ SV * dbd_db_FETCH_attrib (SV * dbh, imp_dbh_t * imp_dbh, SV * keysv)
 			retsv = newSViv((IV)imp_dbh->pg_protocol);
 		break;
 
-	case 12: /* pg_INV_WRITE */
+	case 12: /* pg_INV_WRITE pg_utf8_flag */
 
 		if (strEQ("pg_INV_WRITE", key))
 			retsv = newSViv((IV) INV_WRITE );
+		else if (strEQ("pg_utf8_flag", key))
+			retsv = newSViv((IV)imp_dbh->utf8_flag);
 		break;
 
 	case 13: /* pg_errorlevel */
@@ -870,7 +853,7 @@ int dbd_db_STORE_attrib (SV * dbh, imp_dbh_t * imp_dbh, SV * keysv, SV * valuesv
 		}
 		break;
 
-	case 10: /* AutoCommit  pg_bool_tf  pg_unicode*/
+	case 10: /* AutoCommit  pg_bool_tf */
 
 		if (strEQ("AutoCommit", key)) {
 			if (newval != DBIc_has(imp_dbh, DBIcf_AutoCommit)) {
@@ -885,16 +868,28 @@ int dbd_db_STORE_attrib (SV * dbh, imp_dbh_t * imp_dbh, SV * keysv, SV * valuesv
 
 		else if (strEQ("pg_bool_tf", key)) {
 			imp_dbh->pg_bool_tf = newval!=0 ? DBDPG_TRUE : DBDPG_FALSE;
+			/* Only a few valid values */
+			if (imp_dbh->pg_utf8_flag == -1) {
+				/* Do nothing: same as if it is not set */
+			}
+			else if (imp_dbh->pg_utf8_flag == 0) {
+				imp_dbh->utf8_flag = 0;
+			}
+			else { /* Everything else is 'true' */
+				imp_dbh->utf8_flag = 1;
+				imp_dbh->pg_utf8_flag = 1;
+			}
 			retval = 1;
 		}
 
-		else if (strEQ("pg_unicode", key)) {
-			imp_dbh->unicode = newval!=0 ? DBDPG_TRUE : DBDPG_FALSE;
-			retval = 1;
-		}
+		break;
 
+	case 12: /* pg_utf8_flag */
 
-		break;
+		if (strEQ("pg_utf8_flag", key)) {
+			imp_dbh->pg_utf8_flag = (unsigned)SvIV(valuesv);
+			retval = 1;
+		}
 
 	case 13: /* pg_errorlevel */
 
@@ -1139,7 +1134,7 @@ SV * dbd_st_FETCH_attrib (SV * sth, imp_sth_t * imp_sth, SV * keysv)
 				TRACE_PQFNAME;
 				fieldname = PQfname(imp_sth->result, fields);
 				sv_fieldname = newSVpv(fieldname,0);
-				if (imp_dbh->unicode)
+				if (imp_dbh->utf8_flag)
 					SvUTF8_on(sv_fieldname);
 				(void)av_store(av, fields, sv_fieldname);
 			}
@@ -2713,7 +2708,7 @@ static SV * pg_destringify_array(pTHX_ imp_dbh_t *imp_dbh, unsigned char * input
 					av_push(currentav, newSViv('t' == *string ? 1 : 0));
 				else {
 					SV *sv = newSVpvn(string, section_size);
-					if (imp_dbh->unicode)
+					if (imp_dbh->utf8_flag)
 						SvUTF8_on(sv);
 					av_push(currentav, sv);
 				}
@@ -2842,14 +2837,17 @@ int pg_quickexec (SV * dbh, const char * sql, const int asyncflag)
 	if (TSQL) TRC(DBILOGFP, "%s;\n\n", sql);
 
 	/* Upgrade to a true UTF-8 string in place as needed */
-	if (imp_dbh->unicode) {
+	if (imp_dbh->utf8_flag) {
 		// upgrade_utf8 magic on 'sql'
 	}
 
 	TRACE_PQEXEC;
 	result = PQexec(imp_dbh->conn, sql);
+
 	status = _sqlstate(aTHX_ imp_dbh, result);
 
+	check_client_encoding(aTHX_ imp_dbh);
+
 	imp_dbh->copystate = 0; /* Assume not in copy mode until told otherwise */
 
 	if (TRACE4) TRC(DBILOGFP, "%sGot a status of %d\n", THEADER, status);
@@ -3313,6 +3311,8 @@ int dbd_st_execute (SV * sth, imp_sth_t * imp_sth)
 
 	status = _sqlstate(aTHX_ imp_dbh, imp_sth->result);
 
+	check_client_encoding(aTHX_ imp_dbh);
+
 	imp_dbh->copystate = 0; /* Assume not in copy mode until told otherwise */
 	if (PGRES_TUPLES_OK == status) {
 		TRACE_PQNFIELDS;
@@ -3396,6 +3396,31 @@ int dbd_st_execute (SV * sth, imp_sth_t * imp_sth)
 } /* end of dbd_st_execute */
 
 
+/* Keep utf8_flag in step with the current client_encoding, unless the user has set pg_utf8_flag */
+static void check_client_encoding(pTHX_ imp_dbh_t * imp_dbh)
+{
+	const char * encoding;
+
+	/* Anything other than -1 means the user chose a value themselves, so leave it alone */
+	if (imp_dbh->pg_utf8_flag != -1)
+		return;
+	/* Change the flag (and trace it) only when it disagrees with the reported encoding */
+	encoding = PQparameterStatus(imp_dbh->conn, "client_encoding");
+	if (0 == strncmp(encoding, "UTF8", 4)) {
+		if (!imp_dbh->utf8_flag) {
+			imp_dbh->utf8_flag = 1;
+			if (TRACE4)
+				TRC(DBILOGFP, "%sclient_encoding change caused utf8 flag to change from off to on\n", THEADER);
+		}
+	}
+	else if (imp_dbh->utf8_flag) {
+		imp_dbh->utf8_flag = 0;
+		if (TRACE4)
+			TRC(DBILOGFP, "%sclient_encoding change caused utf8 flag to change from on to off\n", THEADER);
+	}
+}
+
+
 /* ================================================================== */
 AV * dbd_st_fetch (SV * sth, imp_sth_t * imp_sth)
 {
@@ -3495,7 +3520,7 @@ AV * dbd_st_fetch (SV * sth, imp_sth_t * imp_sth)
 						break;
 					default:
 						sv_setpvn(sv, (char *)value, value_len);
-						if (imp_dbh->unicode)
+						if (imp_dbh->utf8_flag)
 							SvUTF8_on(sv);
 					}
 				}
@@ -3503,7 +3528,7 @@ AV * dbd_st_fetch (SV * sth, imp_sth_t * imp_sth)
 					value_len = strlen((char *)value);
 					sv_setpvn(sv, (char *)value, value_len);
 					/* Check for specific types here? */
-					if (imp_dbh->unicode)
+					if (imp_dbh->utf8_flag)
 						SvUTF8_on(sv);
 				}
 			
@@ -3533,7 +3558,7 @@ AV * dbd_st_fetch (SV * sth, imp_sth_t * imp_sth)
 				*/
 				const char * const s = SvPV(AvARRAY(av)[i],len);
 				sv_setpvn(currph->inout, s, len);
-				if (imp_dbh->unicode)
+				if (imp_dbh->utf8_flag)
 					SvUTF8_on(currph->inout);
 			}
 		}
@@ -3879,7 +3904,7 @@ int pg_db_getcopydata (SV * dbh, SV * dataline, int async)
 
 	if (copystatus > 0) {
 		sv_setpv(dataline, tempbuf);
-		if (imp_dbh->unicode)
+		if (imp_dbh->utf8_flag)
 			SvUTF8_on(dataline);
 		TRACE_PQFREEMEM;
 		PQfreemem(tempbuf);
@@ -4688,6 +4713,7 @@ int pg_db_result (SV *h, imp_dbh_t *imp_dbh)
 	while ((result = PQgetResult(imp_dbh->conn)) != NULL) {
 		/* TODO: Better multiple result-set handling */
 		status = _sqlstate(aTHX_ imp_dbh, result);
+		check_client_encoding(aTHX_ imp_dbh);
 		switch (status) {
 		case PGRES_TUPLES_OK:
 			TRACE_PQNTUPLES;
diff --git a/dbdimp.h b/dbdimp.h
index b30ceaf..a5176d2 100644
--- a/dbdimp.h
+++ b/dbdimp.h
@@ -24,14 +24,15 @@ struct imp_dbh_st {
 	int     pg_errorlevel;     /* PQsetErrorVerbosity. Set by user, defaults to 1 */
 	int     server_prepare;    /* do we want to use PQexecPrepared? 0=no 1=yes 2=smart. Can be changed by user */
 	int     async_status;      /* 0=no async 1=async started -1=async has been cancelled */
+    int     pg_utf8_flag;      /* what the user has set pg_utf8_flag to. -1 means not set */
 
     imp_sth_t *async_sth;      /* current async statement handle */
 	AV      *savepoints;       /* list of savepoints */
 	PGconn  *conn;             /* connection structure */
 	char    *sqlstate;         /* from the last result */
 
+    bool    utf8_flag;         /* are we setting the internal Perl utf8 flag on for incoming data? */
 	bool    pg_bool_tf;        /* do bools return 't'/'f'? Set by user, default is 0 */
-    bool    unicode;           /* do we force client_encoding to UTF-8 and set the Perl utf8 string on returned data? */
 	bool    pg_enable_utf8;    /* (DEPRECATED) should we attempt to make utf8 strings? Set by user, default is 0 */
 	bool    prepare_now;       /* force immediate prepares, even with placeholders. Set by user, default is 0 */
 	bool    done_begin;        /* have we done a begin? (e.g. are we in a transaction?) */
-- 
1.7.0.5




nntp.perl.org: Perl Programming lists via nntp and http.
Comments to Ask Bjørn Hansen at ask@perl.org | Group listing | About