Skip to content

Instantly share code, notes, and snippets.

@cypres
Last active August 29, 2015 14:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save cypres/dcc12fbd8a17bdf1ff94 to your computer and use it in GitHub Desktop.
Save cypres/dcc12fbd8a17bdf1ff94 to your computer and use it in GitHub Desktop.
Sample code to count the number of characters in an SMS
#include <cassert> // cassert
#include <cmath> // ceil
#include <iostream> // cout
#include <cstring> // memset, memcpy
// Given a NUL-terminated, UTF-8 encoded string, calculate the length of the
// string after conversion to the GSM 03.38 alphabet.
//
// Counting rules:
//  - Every input character (ASCII byte or multi-byte UTF-8 sequence) counts
//    as one GSM character; characters that cannot be represented are assumed
//    to be replaced with a single '?'.
//  - The characters ^ { } ~ [ ] \ | and the euro sign live in the GSM 03.38
//    extension table and are sent as an escape plus the character, so they
//    count as two.
//
// The input is expected to be valid UTF-8. Truncated or malformed sequences
// are still counted as one character each, and the scan never advances past
// the terminating NUL. A null pointer yields 0.
size_t SmsLength(const char *str) {
  if (!str) return 0;

  // Work on unsigned bytes so comparisons against values >= 0x80 do not
  // depend on whether plain `char` is signed on the target platform.
  const unsigned char *p = reinterpret_cast<const unsigned char *>(str);
  size_t length = 0;

  while (*p != 0) {
    const unsigned char lead = *p++;
    ++length;  // Every character yields at least one GSM character.

    if (__builtin_expect(lead < 0x80, 1)) {
      // Plain ASCII: check for characters from the GSM extension table.
      switch (lead) {
        case '^': case '{': case '}': case '~':
        case '[': case ']': case '\\': case '|':
          ++length;  // Sent as escape + character.
          break;
        default:
          break;
      }
      continue;
    }

    // The euro sign (U+20AC, bytes E2 82 AC) is the only non-ASCII character
    // that needs an escape. The && chain short-circuits, so p[1] is only
    // read when p[0] is not the terminator.
    if (lead == 0xE2 && p[0] == 0x82 && p[1] == 0xAC) {
      ++length;
    }

    // Number of continuation bytes announced by the lead byte.
    int continuation;
    if (lead >= 0xF0) {
      continuation = 3;
    } else if (lead >= 0xE0) {
      continuation = 2;
    } else if (lead >= 0xC0) {
      continuation = 1;
    } else {
      continuation = 0;  // Stray continuation byte: counted on its own.
    }

    // Skip only bytes that really are continuation bytes (10xxxxxx); this
    // stops at the NUL terminator if the sequence is truncated.
    for (; continuation > 0 && (*p & 0xC0) == 0x80; --continuation) {
      ++p;
    }
  }
  return length;
}
// Get the number of SMS messages a UTF-8 string will be split into.
// Assumes:
//  - The string will be sent using GSM 03.38 encoding.
//  - The SMSC will use 8-bit reference numbers, leaving 153 chars per SMS.
// If the message contains unicode chars outside GSM 03.38 it should be sent
// as UCS-2, with fewer chars available per part; see the comments on the
// constants below.
//
// Returns 1 when the message fits in a single SMS (this includes the empty
// string), otherwise the number of concatenated parts.
int SmsSplits(const char *str) {
  // 152 for 16-bit reference numbers, 66 for UCS-2
  const size_t chars_per_sms = 153;
  // 70 for UCS-2
  const size_t chars_single_sms = 160;

  const size_t length = SmsLength(str);
  if (length <= chars_single_sms) {
    return 1;
  }
  // Integer ceiling division: avoids the round trip through double and the
  // implicit double -> int narrowing on return.
  return static_cast<int>((length + chars_per_sms - 1) / chars_per_sms);
}
// Compile with: `clang++ -Ofast -g smslen.cc -o smslen`
// Run `time ./smslen`
//
// Self-test and micro-benchmark for SmsLength() and SmsSplits(). All checks
// use assert(), so they are compiled out if NDEBUG is defined.
int main() {
  const char t[] = "Hej v€rden. Hvordan har du det i dag? "
                   "Jeg glæder mig til at se dig.";
  // Benchmark length calculation
  for (int i = 0; i < 10000000; i++) {
    assert(SmsLength(t) == 68);
  }

  // Try some unicode
  assert(SmsLength("~€") == 4);
  assert(SmsLength("こんにちは") == 5);  // would be sent as ????? with GSM

  // Various test cases for SmsSplits()
  const char s160[] = "Lorem ipsum dolor sit amet, consectetur adipiscing "
      "elit. Nunc auctor sem tellus, in laoreet enim semper laoreet. Nam ornare "
      "augue vel nisi tempus, id cras amet.";
  int split160 = SmsSplits(s160);
  // std::endl flushes, so the line is visible even if the assert below aborts.
  std::cout << "SMS with 160 bytes of lipsum: " << split160 << std::endl;
  assert(split160 == 1);

  // Copy the whole array including its terminating NUL; the buffer must be
  // NUL-terminated because SmsLength() scans until it finds the terminator.
  char s161[sizeof(s160)];
  memcpy(s161, s160, sizeof(s160));
  s161[159] = '~';  // ~ is two GSM 03.38 chars, so the result will be 161
  int split161 = SmsSplits(s161);
  std::cout << "SMS with 161 bytes of lipsum: " << split161 << std::endl;
  assert(split161 == 2);

  char s305[306] = { 0 };
  memset(s305, 'a', 305);  // last byte stays 0 and terminates the string
  int split305 = SmsSplits(s305);
  std::cout << "SMS with 305 'a's: " << split305 << std::endl;
  assert(split305 == 2);
  return 0;
}
<?php
/**
 * Count how many GSM 03.38 characters a UTF-8 string occupies.
 *
 * Each UTF-8 character counts once; the characters ^ { } \ ~ € | [ ] are
 * counted a second time because they are sent as an escape plus the character.
 *
 * @param string $utf8String UTF-8 encoded message text.
 * @return int Length of the message in GSM 03.38 characters.
 */
function countGsm0338Length($utf8String)
{
    // Number of characters that need an extra escape character.
    $escapedCount = preg_match_all('/[\\^{}\\\~€|\\[\\]]/mu', $utf8String, $m);

    return mb_strlen($utf8String, 'utf-8') + $escapedCount;
}
/**
 * Get the number of SMS messages a UTF-8 message will be split into when
 * sent with GSM 03.38 encoding.
 *
 * @param string $message UTF-8 encoded message text.
 * @return int 1 if the message fits in a single SMS, otherwise the number
 *             of concatenated parts.
 */
function countSmsSplits($message)
{
    $chars_per_sms = 153; // 152 for 16-bit reference numbers, 66 for UCS-2
    $chars_single_sms = 160; // 70 for UCS-2

    $len = countGsm0338Length($message);
    if ($len <= $chars_single_sms) {
        return 1;
    }

    // ceil() returns a float; cast so both branches return an int.
    return (int) ceil($len / $chars_per_sms);
}
// Self-test for countGsm0338Length() and countSmsSplits().

// Mixed text with one euro sign, which counts as two GSM characters.
$t = "Hej v€rden. Hvordan har du det i dag? Jeg glæder mig til at se dig.";
assert(countGsm0338Length($t) == 68);
assert(countSmsSplits($t) == 1);

// Exactly 160 characters: still fits in a single SMS.
$s160 = "Lorem ipsum dolor sit amet, consectetur adipiscing ";
$s160 .= "elit. Nunc auctor sem tellus, in laoreet enim semper laoreet. ";
$s160 .= "Nam ornare augue vel nisi tempus, id cras amet.";
assert(countSmsSplits($s160) == 1);

// Replace the last character with ~ (two GSM characters) so the length is
// exactly 161, the smallest length that needs two parts. Writing to offset
// 160 would append instead and test 162 rather than the boundary.
$s161 = substr($s160, 0, 159) . '~';
assert(countSmsSplits($s161) == 2);

// 305 characters still fit in two parts of 153 characters each.
$s305 = str_repeat('Lorem', 61);
assert(countSmsSplits($s305) == 2);
# coding=utf8
from __future__ import division
import re
from math import ceil
def smslen(message):
    """Return the length of ``message`` in GSM 03.38 characters.

    Every character counts once; the characters ``^ { } \\ [ ~ ] |`` and the
    euro sign are counted twice because they are sent as an escape plus the
    character.

    :param message: the message text as a unicode string (``unicode`` on
        Python 2, ``str`` on Python 3).
    :returns: number of GSM 03.38 characters as an int.
    :raises Exception: if ``message`` is not a unicode string.
    """
    try:
        text_type = unicode  # Python 2
    except NameError:
        text_type = str  # Python 3: str is already unicode aware
    if not isinstance(message, text_type):
        raise Exception('Need unicode aware strings to get sms len')
    # Characters that need an escape. The backslash itself is included, and
    # the euro sign is written as an escape so the result does not depend on
    # the source file encoding.
    escaped_chars = u'^{}\\[~]|\u20ac'
    escaped = sum(1 for char in message if char in escaped_chars)
    return len(message) + escaped
def smssplit(message):
    """Return the number of SMS messages ``message`` will be split into.

    Assumes GSM 03.38 encoding and 8-bit reference numbers for concatenated
    messages.

    :param message: the message text as a unicode string.
    :returns: 1 if the message fits in a single SMS, otherwise the number of
        parts, always as an int.
    """
    chars_per_sms = 153  # 152 for 16-bit reference numbers, 66 for UCS-2
    chars_single_sms = 160  # 70 for UCS-2
    length = smslen(message)
    if length <= chars_single_sms:
        return 1
    # Integer ceiling division, so the result is an int on Python 2 as well
    # (math.ceil returns a float there).
    return (length + chars_per_sms - 1) // chars_per_sms
# Self-test for smslen() and smssplit().

# Mixed text with one euro sign, which counts as two GSM characters.
t = (u"Hej v€rden. Hvordan har du det i dag? "
     u"Jeg glæder mig til at se dig.")
assert smslen(t) == 68

# Characters outside GSM 03.38 count as one character each.
assert smslen(u"こんにちは") == 5

# Exactly 160 characters: still a single SMS.
s160 = (u"Lorem ipsum dolor sit amet, consectetur adipiscing "
        u"elit. Nunc auctor sem tellus, in laoreet enim semper laoreet. Nam ornare "
        u"augue vel nisi tempus, id cras amet.")
assert smssplit(s160) == 1

# Swap the last character for ~ (two GSM characters): 161 needs two parts.
s161 = s160[:159] + u'~'
assert smssplit(s161) == 2

# 305 characters still fit in two parts.
s305 = u"Lorem" * 61
assert smssplit(s305) == 2
function CountSmsSplits(string message):
length := number of UTF-8 decoded characters in message; // unicode aware, e.g. mb_strlen
// Add the number of times these special chars occur in message: '^{}~€[]\|', e.g. with a regex.
length := length + count of regular expression matches("/[\\^{}\\\~€|\\[\\]]/mu", message)
// Calculate number of splits
charsPerMessageInChain := 153 // 152 for 16-bit reference numbers, 66 for UCS-2
charsSingleMessage := 160 // 70 for UCS-2
if length <= charsSingleMessage:
return 1
else:
return ceiling(length / charsPerMessageInChain)
end if
end function
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment