I've been having major headaches over inserting UTF-8 encoded data into MySQL 4.1.x The problem is many characters are being replaced with "?".
Languages I need to support are far eastern languages such as Japanese and Cryllic symbols.
Data is recieved in the form of XML. I'm using PHP5/MySQL 4.1 and libxml to parse XML.
I can ECHO the parsed XML and it appears FINE, but when I insert it into the database and retrieve the data some chracters are mangled.
I know that MySQL 4.1 now has full UTF-8/Unicode support, libxml too. Also note, i'm traversing the XML via DOM, this FORCES libxml to convert the parsed file to UTF-8 :-)
Heres some code:
[code]<?php
ob_start('ob_gzhandler');
header('Content-Type: text/html; charset=UTF-8');
mb_internal_encoding('UTF-8');
include 'System/runCheck.php';
runCheck(date('H'), date('i'));
function processData($str) {
// Snipped this, all it does is replace entitie codes with actual characters.
}
mysql_connect('XXX', 'XXX', 'XXX');
mysql_select_db('XXX');
$timeStart = time();
$feeds = array(ARRAY OF XML FEEDS);
foreach ($feeds as $feedURL) {
$checkTime = round(time() - $timeStart);
if ($checkTime < 150) {
if ($feed = new DOMDocument) {
$feed->strictErrorChecking = false;
$feed->preserveWhiteSpace = false;
$feed->load($feedURL);
switch (strtolower($feed->documentElement->tagName)) {
case 'rss';
$feedType = 'rss';
$feedVersion = $feed->documentElement->getAttribute('version');
$feedItemsElement = 'item';
break;
case 'feed';
$feedType = 'atom';
$feedVersion = $feed->documentElement->getAttribute('version');
$feedItemsElement = 'entry';
break;
case 'rdf:rdf':
$feedType = 'rdf';
$feedVersion = '1.0';
$feedItemsElement = 'item';
}
if (in_array($feedType, array('rss', 'atom', 'rdf'))) {
$feedTitle = $feed->getElementsByTagname('title')->item(0);
if (in_array(strtolower($feedTitle->parentNode->nodeName), array('channel', 'feed'))) $feedTitle = $feedTitle->textContent;
else $feedTitle = '';
echo "<h1>$feedTitle</h1>";
$items = $feed->getElementsByTagname($feedItemsElement);
foreach ($items as $item) {
unset($title, $body, $link);
foreach ($item->childNodes as $node) {
if ($feedType == 'atom') {
switch (strtolower($node->nodeName)) {
case 'title':
$title = processData($node->textContent);
break;
case 'content':
$body = processData($node->textContent);
break;
case 'summary':
if (!$body) $body = processData($node->textContent);
break;
case 'link':
$link = trim($node->getAttribute('href'));
break;
}
}
else {
switch (strtolower($node->nodeName)) {
case 'title':
$title = processData($node->textContent);
break;
case 'description':
$body = processData($node->textContent);
break;
case 'content:encoded':
if (!$body) $body = processData($node->textContent);
break;
case 'link':
$link = trim($node->textContent);
break;
}
}
}
$checksum = sha1($title . $body . $link);
$adCheckURLS = array('spam site url', 'spam site url', 'spam site url', 'spam site url', 'spam site url', 'viewRssAd.php');
unset($ad);
foreach ($adCheckURLS as $adURL) {
$check = strpos(strtolower($link), $adURL);
if ($check) {
$ad = true;
break;
}
}
if (!$ad && substr(strtolower($title), 0, 4)!= 'adv:' && strtolower($title)!= 'spam site') mysql_query("INSERT INTO items (title, feedtitle, body, feedurl, itemurl, lang, checksum, time) VALUES ('" . addslashes($title) . "', '" . addslashes($feedTitle) . "', '" . addslashes($body) . "', '" . addslashes($feedURL) . "', '" . addslashes($link) . "', 'xx', '$checksum', '" . time() . "')");
#echo "<h2 style=\"margin-bottom: 0\">$title</h2><p style=\"color: blue; margin: 0\">$body</p><p style=\"color: green; margin: 0\">$link</p>";
}
}
}
}
}
?>[code]