pidgin: 4856b929: * Change yahoo_html_to_codes() a little ...

Wed Aug 19 18:09:16 EDT 2009

-----------------------------------------------------------------
Revision: 4856b929b0811c3ef6e0fa552e431bdcabaf5223
Ancestor: 1fa60074b939d7c3827e843242f1532560f6f4a6
Author: markdoliner at pidgin.im
Date: 2009-08-19T22:01:10
Branch: im.pidgin.pidgin
URL: http://d.pidgin.im/viewmtn/revision/info/4856b929b0811c3ef6e0fa552e431bdcabaf5223

Modified files:
        libpurple/protocols/yahoo/libymsg.h
        libpurple/protocols/yahoo/util.c
        libpurple/tests/test_yahoo_util.c

ChangeLog: 

* Change yahoo_html_to_codes() a little to hopefully be more straightforward
  and more similar to yahoo_codes_to_html()
* Add documentation to yahoo_html_to_codes() that explains the differences
  between the encoded text that we send and the encoded text sent by yahoo
* Enable the test cases for yahoo_html_to_codes()

-------------- next part --------------
============================================================

--- libpurple/protocols/yahoo/libymsg.h	c505b82f296148f9159201e9c12d8dc3fcf02bf1
+++ libpurple/protocols/yahoo/libymsg.h	22e537c8bea812721238661dda7ebf4fe877a8dd
@@ -280,6 +280,27 @@ char *yahoo_codes_to_html(const char *x)
 void yahoo_init_colorht(void);
 void yahoo_dest_colorht(void);
 char *yahoo_codes_to_html(const char *x);
+
+/**
+ * This function takes a normal HTML message and converts it to the message
+ * format used by Yahoo, which uses a frankensteinish combination of ANSI
+ * escape codes and broken HTML.
+ *
+ * It results in slightly different output than would be sent by official
+ * Yahoo clients.  The two main differences are:
+ *
+ * 1. We always close all tags, whereas official Yahoo clients leave tags
+ *    dangling open at the end of each message (and the client treats them
+ *    as closed).
+ * 2. We always close inner tags first before closing outer tags.
+ *
+ * For example, if you want to send this message:
+ *   <b> bold <i> bolditalic </i></b><i> italic </i>
+ * Official Yahoo clients would send:
+ *   ESC[1m bold ESC[2m bolditalic ESC[x1m italic
+ * But we will send:
+ *   ESC[1m bold ESC[2m bolditalic ESC[x2mESC[x1mESC[2m italic ESC[x2m
+ */
 char *yahoo_html_to_codes(const char *src);
 
 gboolean
============================================================
--- libpurple/protocols/yahoo/util.c	31666b2add695540ed3a138e979cdb9a4e0089bf
+++ libpurple/protocols/yahoo/util.c	bbd1e1c1e5aa6b0dd83bfecda4b1c757529a1f6c
@@ -669,7 +669,7 @@ static const gint _point_sizes [] = { 8,
 #define POINT_SIZE(x) (_point_sizes [MIN ((x > 0 ? x : 1), MAX_FONT_SIZE) - 1])
 static const gint _point_sizes [] = { 8, 10, 12, 14, 20, 30, 40 };
 
-enum fatype
+enum fontattr_type
 {
 	FATYPE_SIZE,
 	FATYPE_COLOR,
@@ -679,7 +679,7 @@ typedef struct
 
 typedef struct
 {
-	enum fatype type;
+	enum fontattr_type type;
 	union {
 		int size;
 		char *color;
@@ -688,6 +688,17 @@ typedef struct
 	} u;
 } fontattr;
 
+typedef struct
+{
+	gboolean bold;
+	gboolean italic;
+	gboolean underline;
+	gboolean in_link;
+	int font_size;
+	char *font_face;
+	char *font_color;
+} CurrentMsgState;
+
 static void fontattr_free(fontattr *f)
 {
 	if (f->type == FATYPE_COLOR)
@@ -876,167 +887,124 @@ char *yahoo_html_to_codes(const char *sr
 	GString *dest;
 	char *esc;
 	GQueue *ftattr = NULL;
-	gboolean no_more_specials = FALSE;
+	gboolean no_more_gt_brackets = FALSE;
+	gchar *tag, *tag_name;
+	gboolean is_closing_tag;
+	CurrentMsgState current_state;
 
+	bzero(&current_state, sizeof(current_state));
+
 	src_len = strlen(src);
 	dest = g_string_sized_new(src_len);
 
 	for (i = 0; i < src_len; i++) {
-
-		if (src[i] == '<' && !no_more_specials) {
+		if (src[i] == '<' && !no_more_gt_brackets) {
+			/* The start of an HTML tag  */
 			j = i;
 
-			while (1) {
-				j++;
+			while (j++ < src_len) {
+				if (src[j] != '>') {
+					if (src[j] == '"') {
+						/* We're inside a quoted attribute value. Skip to the end */
+						j++;
+						while (j != src_len && src[j] != '"')
+							j++;
+					} else if (src[j] == '\'') {
+						/* We're inside a quoted attribute value. Skip to the end */
+						j++;
+						while (j != src_len && src[j] != '\'')
+							j++;
+					}
+					if (j != src_len)
+						/* Keep looking for the end of this tag */
+						continue;
 
-				if (j >= src_len) { /* no '>' */
+					/* This < has no corresponding > */
 					g_string_append_c(dest, src[i]);
-					no_more_specials = TRUE;
+					no_more_gt_brackets = TRUE;
 					break;
 				}
 
-				if (src[j] == '<') {
-					/* FIXME: This doesn't convert outgoing entities.
-					 *        However, I suspect this case may never
-					 *        happen anymore because of the entities.
-					 */
-					g_string_append_len(dest, &src[i], j - i);
-					i = j - 1;
-					if (ftattr) {
-						fontattr *f;
+				tag = g_strndup(src + i, j - i + 1);
+				tag_name = yahoo_markup_get_tag_name(tag, &is_closing_tag);
 
-						while ((f = g_queue_pop_head(ftattr)))
-							fontattr_free(f);
-						g_queue_free(ftattr);
-						ftattr = NULL;
-					}
-					break;
-				}
-
-				if (src[j] == ' ') {
-					if (!g_ascii_strncasecmp(&src[i+1], "BODY", j - i - 1)) {
-						char *t = strchr(&src[j], '>');
-						if (!t) {
-							g_string_append(dest, &src[i]);
+				if (g_str_equal(tag_name, "a")) {
+					j += 7;
+					g_string_append(dest, "\033[lm");
+					if (purple_str_has_prefix(src + j, "mailto:"))
+						j += sizeof("mailto:") - 1;
+					while (1) {
+						g_string_append_c(dest, src[j]);
+						if (++j >= src_len) {
 							i = src_len;
 							break;
-						} else {
-							i = t - src;
-							break;
 						}
-					} else if (!g_ascii_strncasecmp(&src[i+1], "A HREF=\"", j - i - 1)) {
-						j += 7;
-						g_string_append(dest, "\033[lm");
-						if (purple_str_has_prefix(src + j, "mailto:"))
-							j += sizeof("mailto:") - 1;
-						while (1) {
-							g_string_append_c(dest, src[j]);
-							if (++j >= src_len) {
-								i = src_len;
-								break;
-							}
-							if (src[j] == '"') {
-								g_string_append(dest, "\033[xlm");
-								while (1) {
-									if (++j >= src_len) {
-										i = src_len;
-										break;
-									}
-									if (!g_ascii_strncasecmp(&src[j], "</A>", 4)) {
-										j += 3;
-										break;
-									}
+						if (src[j] == '"') {
+							g_string_append(dest, "\033[xlm");
+							while (1) {
+								if (++j >= src_len) {
+									i = src_len;
+									break;
 								}
-								i = j;
-								break;
+								if (!g_ascii_strncasecmp(&src[j], "</A>", 4)) {
+									j += 3;
+									break;
+								}
 							}
+							i = j;
+							break;
 						}
-					} else if (!g_ascii_strncasecmp(&src[i+1], "SPAN", j - i - 1)) { /* drop span tags */
-						while (1) {
-							if (++j >= src_len) {
-								g_string_append(dest, &src[i]);
-								i = src_len;
-								break;
-							}
-							if (src[j] == '>') {
-								i = j;
-								break;
-							}
-						}
-					} else if (g_ascii_strncasecmp(&src[i+1], "FONT", j - i - 1)) { /* not interested! */
-						while (1) {
-							if (++j >= src_len) {
-								g_string_append(dest, &src[i]);
-								i = src_len;
-								break;
-							}
-							if (src[j] == '>') {
-								g_string_append_len(dest, &src[i], j - i + 1);
-								i = j;
-								break;
-							}
-						}
-					} else { /* yay we have a font tag */
-						_parse_font_tag(src, dest, &i, &j, src_len, &colors, &tags, ftattr);
 					}
 
-					break;
-				}
-
-				if (src[j] == '>') {
-					/* This has some problems like the FIXME for the
-					 * '<' case. and like that case, I suspect the case
-					 * that this has problems is won't happen anymore anyway.
-					 */
-					int sublen = j - i - 1;
-
-					if (sublen) {
-						if (!g_ascii_strncasecmp(&src[i+1], "B", sublen)) {
-							g_string_append(dest, "\033[1m");
-						} else if (!g_ascii_strncasecmp(&src[i+1], "/B", sublen)) {
-							g_string_append(dest, "\033[x1m");
-						} else if (!g_ascii_strncasecmp(&src[i+1], "I", sublen)) {
-							g_string_append(dest, "\033[2m");
-						} else if (!g_ascii_strncasecmp(&src[i+1], "/I", sublen)) {
-							g_string_append(dest, "\033[x2m");
-						} else if (!g_ascii_strncasecmp(&src[i+1], "U", sublen)) {
-							g_string_append(dest, "\033[4m");
-						} else if (!g_ascii_strncasecmp(&src[i+1], "/U", sublen)) {
-							g_string_append(dest, "\033[x4m");
-						} else if (!g_ascii_strncasecmp(&src[i+1], "/A", sublen)) {
-							g_string_append(dest, "\033[xlm");
-						} else if (!g_ascii_strncasecmp(&src[i+1], "BR", sublen)) {
-							g_string_append_c(dest, '\n');
-						} else if (!g_ascii_strncasecmp(&src[i+1], "/BODY", sublen)) {
-							/* mmm, </body> tags. *BURP* */
-						} else if (!g_ascii_strncasecmp(&src[i+1], "/SPAN", sublen)) {
-							/* </span> tags. dangerously close to </spam> */
-						} else if (!g_ascii_strncasecmp(&src[i+1], "/FONT", sublen) && tags != NULL) {
-							char *etag;
-
-							etag = tags->data;
-							tags = g_slist_delete_link(tags, tags);
-							if (etag) {
-								g_string_append(dest, etag);
-								if (!strcmp(etag, "</font>")) {
-									if (colors != NULL) {
-										g_free(colors->data);
-										colors = g_slist_delete_link(colors, colors);
-									}
-								}
-								g_free(etag);
+				} else if (g_str_equal(tag_name, "font")) {
+					_parse_font_tag(src, dest, &i, &j, src_len, &colors, &tags, ftattr);
+				} else if (g_str_equal(tag_name, "b")) {
+					g_string_append(dest, "\033[1m");
+					current_state.bold = TRUE;
+				} else if (g_str_equal(tag_name, "/b")) {
+					if (current_state.bold) {
+						g_string_append(dest, "\033[x1m");
+						current_state.bold = FALSE;
+					}
+				} else if (g_str_equal(tag_name, "i")) {
+					current_state.italic = TRUE;
+					g_string_append(dest, "\033[2m");
+				} else if (g_str_equal(tag_name, "/i")) {
+					if (current_state.italic) {
+						g_string_append(dest, "\033[x2m");
+						current_state.italic = FALSE;
+					}
+				} else if (g_str_equal(tag_name, "u")) {
+					current_state.underline = TRUE;
+					g_string_append(dest, "\033[4m");
+				} else if (g_str_equal(tag_name, "/u")) {
+					if (current_state.underline) {
+						g_string_append(dest, "\033[x4m");
+						current_state.underline = FALSE;
+					}
+				} else if (g_str_equal(tag_name, "/a")) {
+					g_string_append(dest, "\033[xlm");
+				} else if (g_str_equal(tag_name, "br")) {
+					g_string_append_c(dest, '\n');
+				} else if (g_str_equal(tag_name, "/font")) {
+					if (tags != NULL) {
+						char *etag = tags->data;
+						tags = g_slist_delete_link(tags, tags);
+						g_string_append(dest, etag);
+						if (g_str_equal(etag, "</font>")) {
+							if (colors != NULL) {
+								g_free(colors->data);
+								colors = g_slist_delete_link(colors, colors);
 							}
-						} else {
-							g_string_append_len(dest, &src[i], j - i + 1);
 						}
-					} else {
-						g_string_append_len(dest, &src[i], j - i + 1);
+						g_free(etag);
 					}
-
-					i = j;
-					break;
 				}
 
+				i = j;
+				g_free(tag);
+				g_free(tag_name);
+				break;
 			}
 
 		} else {
============================================================
--- libpurple/tests/test_yahoo_util.c	948b653aa8f524b8e4570e6e7dae24f843183ed5
+++ libpurple/tests/test_yahoo_util.c	efe0e7daa06d052dac47e925e354f9c0a41acc9c
@@ -104,7 +104,6 @@ END_TEST
 }
 END_TEST
 
-#if 0
 START_TEST(test_html_to_codes)
 {
 	assert_string_equal_free("plain",
@@ -129,7 +128,6 @@ START_TEST(test_html_to_codes)
 			yahoo_html_to_codes("plain &amp;"));
 
 	/* bold/italic/underline */
-	// MARK: This isn't correct.  Should not have the closing bold escape code
 	assert_string_equal_free("\x1B[1mbold\x1B[x1m",
 			yahoo_html_to_codes("<b>bold</b>"));
 	assert_string_equal_free("\x1B[2mitalic\x1B[x2m",
@@ -140,13 +138,12 @@ START_TEST(test_html_to_codes)
 			yahoo_html_to_codes("no</u> markup"));
 	assert_string_equal_free("\x1B[1mbold\x1B[x1m \x1B[2mitalic\x1B[x2m \x1B[4munderline\x1B[x4m",
 			yahoo_html_to_codes("<b>bold</b> <i>italic</i> <u>underline</u>"));
-	assert_string_equal_free("\x1B[1mbold \x1B[2mbolditalic\x1B[x1m italic\x1B[x1m",
+	assert_string_equal_free("\x1B[1mbold \x1B[2mbolditalic\x1B[x2m\x1B[x1m\x1B[2m italic\x1B[x2m",
 			yahoo_html_to_codes("<b>bold <i>bolditalic</i></b><i> italic</i>"));
-	assert_string_equal_free("\x1B[1mbold \x1B[2mbolditalic\x1B[x1m \x1B[4mitalicunderline",
+	assert_string_equal_free("\x1B[1mbold \x1B[2mbolditalic\x1B[x2m\x1B[x1m\x1B[2m \x1B[4mitalicunderline\x1B[x4m\x1B[x2m",
 			yahoo_html_to_codes("<b>bold <i>bolditalic</i></b><i> <u>italicunderline</u></i>"));
 }
 END_TEST
-#endif
 
 Suite *
 yahoo_util_suite(void)
@@ -161,11 +158,9 @@ yahoo_util_suite(void)
 	tcase_add_test(tc, test_codes_to_html);
 	suite_add_tcase(s, tc);
 
-#if 0
 	tc = tcase_create("Convert IM from HTML to network format");
 	tcase_add_test(tc, test_html_to_codes);
 	suite_add_tcase(s, tc);
-#endif
 
 	return s;
 }