xaprb/analyze_text.pl

## analyze_text.pl
#!/usr/bin/perl

use strict;
use warnings FATAL => 'all';

use English qw(-no_match_vars);
use Lingua::EN::Fathom;
use TeX::Hyphen;
use List::Util qw(min);

my $filename = shift or die "Give me a filename";

my $hyp = new TeX::Hyphen();
   # file     => 'hyphen.tex',
   # style    => 'czech',
   # leftmin  => 2,
   # rightmin => 2,
   # my @points = $hyp->hyphenate($word);

my $f = new Lingua::EN::Fathom;

# First print out a report on the whole file.
$f->analyse_file($filename);
print($f->report);

=pod
Number of characters       : 65102
Number of words            : 10707
Percent of complex words   : 15.36
Average syllables per word : 1.6678
Number of sentences        : 579
Average words per sentence : 18.4922
Number of text lines       : 246
Number of blank lines      : 248
Number of paragraphs       : 246


READABILITY INDICES

Fog                        : 13.5424
Flesch                     : 46.9706
Flesch-Kincaid             : 11.3019
=cut

=pod
# Print out the longest words and their frequency.
my %words = $f->unique_words;
my @longest = reverse sort {
      length($a) * $words{$a} <=> length($b) * $words{$b}
   } keys %words;
@longest = grep { length($_) > 4 } @longest;
foreach my $i ( 0 .. min(30, $#longest ) ) {
   printf("%-30s %5d\n", $longest[$i], $words{$longest[$i]});
}
=cut

# Now find the longest and least readable paragraphs, and the sentences with the
# most syllables.  These metrics are a 1 if more is better, and a 0 else.
my %metrics = (
   num_chars             => 0,
   num_words             => 0,
   percent_complex_words => 0,
   num_sentences         => 0,
   syllables_per_word    => 0,
   words_per_sentence    => 0,
   fog                   => 0,
   flesch                => 1,
   kincaid               => 0,
);

my @stats;
my %syll_for;
open my $fi, "<", $filename or die $OS_ERROR;
while ( my $line = <$fi> ) {
   chomp $line;
   next unless $line;

   my $st = { line => $line };
   $f->analyse_block($line);
   foreach my $metric ( keys %metrics ) {
      $st->{$metric} = $f->$metric();
   }

   # My own metric: syllables per paragraph and syllables per sentence.
   $st->{tex_syllables} = 0;
   foreach my $sentence ( split(/[.!?]\s+(?=[A-Z])/, $line ) ) {
      $sentence =~ s/[^\w\s]+//g;
      foreach my $word ( split(/\s+/, $sentence ) ) {
         my $hy = $hyp->hyphenate($word);
         $syll_for{$sentence} += $hy;
         $st->{tex_syllables} += $hy;
      }
   }

   push @stats, $st;
}
close $fi;

my %worst_overall;

foreach my $metric ( qw(tex_syllables), keys %metrics ) {
   my @top = sort { $a->{$metric} <=> $b->{$metric} } @stats;
   if ( !$metrics{$metric} ) {
      @top = reverse @top;
   }
   if ( $metric eq 'flesch' ) {
      @top = grep { $_->{flesch} != 0 } @top;
   }
   print "====================== $metric\n";
   foreach my $i ( 0 .. min(30, $#top) ) {
      my $st = $top[$i];
      printf("%6.4f %s\n", $st->{$metric}, substr($st->{line}, 0, 70));
      $worst_overall{$st->{line}}++;
   }
   print "\n";
}

# Syllables-per-sentence report.
my @top = reverse sort { $syll_for{$a} <=> $syll_for{$b} } keys %syll_for;
print "====================== syllables per sentence\n";
foreach my $i ( 0 .. min(30, $#top) ) {
   my $sent = $top[$i];
   print $syll_for{$sent}, ' ', $sent, "\n";
}

# Lines that showed up as bad in most of the Fathom reports.
@top = reverse sort { $worst_overall{$a} <=> $worst_overall{$b} } keys %worst_overall;
print "\n====================== worst overall\n";
foreach my $i ( 0 .. min(30, $#top) ) {
   my $sent = $top[$i];
   print $worst_overall{$sent}, ' ', substr($sent, 0, 70), "\n";
}

## ch04.txt
File name                  : ch04-Schema_Optimization-clean.txt
Number of characters       : 117260
Number of words            : 19970
Percent of complex words   : 13.28
Average syllables per word : 1.5864
Number of sentences        : 1068
Average words per sentence : 18.6985
Number of text lines       : 430
Number of blank lines      : 430
Number of paragraphs       : 430


READABILITY INDICES

Fog                        : 12.7934
Flesch                     : 53.6441
Flesch-Kincaid             : 10.4223
====================== tex_syllables
68.0000 Another reason to create cache tables is for optimizing search and ret
62.0000 This design may appeal to developers, because it lets them work in a c
57.0000 By now, you can probably see the pattern: if a user wants to see both
57.0000 However, it's easy to get into trouble by over optimizing queries. For
55.0000 Both floating point and DECIMAL types let you specify the desired prec
54.0000 Once you choose a type, make sure you use the same type in all related
53.0000 The most common way to denormalize data is to duplicate, or cache, cer
53.0000 In most storage engines, an index can cover queries that access only c
52.0000 An inexperienced user might think this identifies the column's role as
52.0000 This query shows that increasing the prefix gives successively smaller
51.0000 The easiest way to understand indexing concepts is with an illustratio
50.0000 MySQL AB is working on improving this. Some of the upcoming improvemen
50.0000 Some kinds of data don't correspond directly to a built in type. A tim
50.0000 Schemas are often designed from E R (entity relationship) diagrams or
49.0000 Storage engines store B Tree indexes differently on disk. This can aff
48.0000 Be very careful with completely “random” strings, such as those pr
47.0000 The tools for de fragmenting data and indexes are OPTIMIZE TABLE, a no
47.0000 This lets MySQL use indexes such as (active, sex, country, age). Of co
47.0000 If the last_online restriction appears without the age restriction, or
47.0000 For high concurrency workloads, inserting in primary key order can act
46.0000 We assume the employee's name is unique and can be used as a primary k
46.0000 * Tables built upon clustered indexes are subject to page splits when
46.0000 With four bytes, the most frequent prefixes occur quite a bit more oft
46.0000 The data type's complexity is also important. For example, integer dat
45.0000 Now that you've learned more than you ever wanted to know about indexi
43.0000 This is the key reason for creating summary tables: to provide statist
43.0000 Redundant indexes usually appear when people add indexes to the table.
43.0000 # MySQL can't perform the LIKE operation in the index. This is a limit
43.0000 When an index covers the query, you'll see “Using Index” in the Ex
43.0000 Also in contrast to MyISAM, secondary indexes are very different from
42.0000 The terms “cache table” and “summary table” don't have a stand

====================== flesch
-44.5100 * Normalized updates are usually faster than denormalized updates.
-5.2585 The probability of a hash collision grows much faster than you might t
-4.4375 Here's an example: if you design a product table with an ENUM field to
8.1015 The differences between clustered and non clustered data layouts, and
8.3650 InnoDB supports transactions and four transaction isolation levels.
11.1754 Clustering the data by the primary key can be very beneficial to the a
13.1050 We've covered most important data type considerations, some of them wi
14.6785 Good performance depends on both an optimal schema and optimal indexin
15.7704 An inexperienced user might think this identifies the column's role as
16.8000 These advantages can boost performance tremendously, if you design you
18.1478 You can find a good prefix length by selecting a sample of the most fr
19.1505 * Covering indexes are especially helpful for InnoDB tables because of
21.3950 If the Extra column of EXPLAIN contains “Using temporary,” the que
23.4511 Schemas are often designed from E R (entity relationship) diagrams or
24.0141 This design may appeal to developers, because it lets them work in a c
24.3771 InnoDB stores the same data very differently because of its clustered
24.9186 Unfortunately, it doesn't work for unique indexes, because DISABLE KEY
26.0775 Normalization generally means the degree to which each fact is represe
26.3050 If you need quick lookups on multi dimensional data from any combinati
26.3050 Another way to calculate a good prefix length is by computing selectiv
26.6998 When you add indexes, try to extend existing indexes instead, because
26.9712 Any application that keeps counts in a table can eventually run into c
27.2553 * Secondary (non clustered) indexes are larger because their leaf node
27.4850 The table's data storage can also become fragmented. However, data sto
27.5142 Because of the space and computational cost, you should use DECIMAL on
27.5142 Covering indexes can be a very powerful tool, and can improve performa
27.6102 Prefix indexes are a great way to make indexes smaller and faster in m
28.0034 It's harder for MySQL to optimize queries that refer to nullable colum
28.0071 Another frequent performance nightmare is Object Relational Mapping (O
28.0500 * Pages become sparsely and irregularly filled because of splitting, s
28.6543 These limitations make hash indexes useful only in special cases. Howe

====================== fog
36.0000 Here's an example: if you design a product table with an ENUM field to
33.6706 The probability of a hash collision grows much faster than you might t
28.7172 You can find a good prefix length by selecting a sample of the most fr
28.2000 * Normalized updates are usually faster than denormalized updates.
27.9652 Clustering the data by the primary key can be very beneficial to the a
27.8571 An inexperienced user might think this identifies the column's role as
26.5412 The differences between clustered and non clustered data layouts, and
26.1306 Prefix indexes are a great way to make indexes smaller and faster in m
23.5373 A hash index is built on a hash table.<ref name="ftn5">See the compute
22.9333 These advantages can boost performance tremendously, if you design you
22.6507 Redundant indexes usually appear when people add indexes to the table.
22.0000 We've covered most important data type considerations, some of them wi
21.5692 Adding and removing the LIMIT changes the query plan, which lets us tw
21.5273 Unfortunately, it doesn't work for unique indexes, because DISABLE KEY
21.5158 Another frequent performance nightmare is Object Relational Mapping (O
21.4323 The only difference between the BLOB and TEXT families are that BLOB t
21.2865 For example, if you store the value b'00111001' (which is the binary e
21.2808 Be very careful with completely “random” strings, such as those pr
21.2780 Any application that keeps counts in a table can eventually run into c
21.1660 When you add indexes, try to extend existing indexes instead, because
21.0857 This works well because the MySQL query optimizer notices there's a sm
20.5846 Good performance depends on both an optimal schema and optimal indexin
20.3024 Even with the index, the query can be slow if the user interface is pa
20.3000 Another way to calculate a good prefix length is by computing selectiv
20.3000 If you need quick lookups on multi dimensional data from any combinati
20.2154 The main reason to keep redundant indexes is when extending an existin
20.0795 If someone talks about an index without mentioning a type, it's probab
20.0000 Now suppose we add another WHERE criterion that we know will reduce th
19.7091 * Covering indexes are especially helpful for InnoDB tables because of
19.7026 The moral of the story is, you should strive to insert data in primary
19.6308 Covering indexes can be a very powerful tool, and can improve performa

====================== num_words
153.0000 Another reason to create cache tables is for optimizing search and ret
127.0000 * Tables built upon clustered indexes are subject to page splits when
121.0000 Oddly enough, the first thing to decide is whether we have to use inde
118.0000 Both floating point and DECIMAL types let you specify the desired prec
113.0000 However, it's easy to get into trouble by over optimizing queries. For
112.0000 By now, you can probably see the pattern: if a user wants to see both
112.0000 If the last_online restriction appears without the age restriction, or
109.0000 This lets MySQL use indexes such as (active, sex, country, age). Of co
105.0000 We assume the employee's name is unique and can be used as a primary k
105.0000 An inexperienced user might think this identifies the column's role as
103.0000 Here's the trick: even if there's a query that doesn't restrict the re
102.0000 The tools for de fragmenting data and indexes are OPTIMIZE TABLE, a no
99.0000 Be very careful with completely “random” strings, such as those pr
98.0000 Once you choose a type, make sure you use the same type in all related
97.0000 You may have noticed we're insisting on keeping the age column at the
94.0000 This example shows how InnoDB can lock rows it doesn't really need, ev
94.0000 MySQL can sometimes use one index for both sorting and finding rows. T
93.0000 The easiest way to understand indexing concepts is with an illustratio
93.0000 This query shows that increasing the prefix gives successively smaller
93.0000 B Tree indexes work well for lookups by the full key value, a key rang
92.0000 The most common way to denormalize data is to duplicate, or cache, cer
92.0000 One way to illustrate this is by showing many variations on a single q
92.0000 If you require high performance, you must design your schema and index
91.0000 Redundant keys are a bit different from duplicated keys. If there is a
91.0000 Unlike some other database servers, you can't choose which index to cl
89.0000 Because InnoDB locks rows only when it accesses them, an index can red
89.0000 Some kinds of data don't correspond directly to a built in type. A tim
88.0000 If you need to store many true or false values, consider MySQL's nativ
86.0000 In most storage engines, an index can cover queries that access only c
86.0000 When an index covers the query, you'll see “Using Index” in the Ex
86.0000 Because it's really up to the storage engines to store the data, not a

====================== percent_complex_words
62.5000 * Normalized updates are usually faster than denormalized updates.
38.4615 Good performance depends on both an optimal schema and optimal indexin
37.5000 InnoDB supports transactions and four transaction isolation levels.
35.0000 B Tree indexes can become fragmented, which reduces performance. Fragm
35.0000 We've covered most important data type considerations, some of them wi
33.3333 * Insert speed depends heavily on inserting in clustered key (primary
33.3333 These advantages can boost performance tremendously, if you design you
32.3529 The differences between clustered and non clustered data layouts, and
32.1429 In addition to the Memory storage engine's explicit hash indexes, the
31.8182 Unfortunately, it doesn't work for unique indexes, because DISABLE KEY
30.7692 Your choice determines how MySQL stores the data, in memory and on dis
30.0000 * Covering indexes can use the primary key values contained at the lea
27.2727 * Covering indexes are especially helpful for InnoDB tables because of
27.0270 It's harder for MySQL to optimize queries that refer to nullable colum
26.7606 Storage engines store B Tree indexes differently on disk. This can aff
26.5306 These limitations make hash indexes useful only in special cases. Howe
26.4151 When you add indexes, try to extend existing indexes instead, because
26.3158 # Use REPAIR TABLE to build the table's indexes. This will build all i
26.3158 We benchmarked two cases. The first is inserting into a userinfo table
26.3158 * Secondary (non clustered) indexes are larger because their leaf node
25.8065 The data type's complexity is also important. For example, integer dat
25.0000 Sometimes you can create duplicate indexes without knowing it. For exa
24.3243 Integer types can optionally have the UNSIGNED attribute, which disall
24.2424 Schemas are often designed from E R (entity relationship) diagrams or
23.9130 Clustering the data by the primary key can be very beneficial to the a
23.5294 People who ask for help with performance issues are frequently advised
23.5294 Now you know why we said the column order is extremely important: thes
23.3333 The table's data storage can also become fragmented. However, data sto
23.0769 Covering indexes can be a very powerful tool, and can improve performa
23.0769 Because of the space and computational cost, you should use DECIMAL on
22.7848 This design may appeal to developers, because it lets them work in a c

====================== num_chars
902.0000 Another reason to create cache tables is for optimizing search and ret
720.0000 Both floating point and DECIMAL types let you specify the desired prec
691.0000 * Tables built upon clustered indexes are subject to page splits when
654.0000 Oddly enough, the first thing to decide is whether we have to use inde
637.0000 By now, you can probably see the pattern: if a user wants to see both
637.0000 However, it's easy to get into trouble by over optimizing queries. For
627.0000 If the last_online restriction appears without the age restriction, or
619.0000 Be very careful with completely “random” strings, such as those pr
617.0000 Once you choose a type, make sure you use the same type in all related
615.0000 This lets MySQL use indexes such as (active, sex, country, age). Of co
612.0000 We assume the employee's name is unique and can be used as a primary k
611.0000 An inexperienced user might think this identifies the column's role as
580.0000 The tools for de fragmenting data and indexes are OPTIMIZE TABLE, a no
566.0000 Here's the trick: even if there's a query that doesn't restrict the re
566.0000 This query shows that increasing the prefix gives successively smaller
560.0000 The most common way to denormalize data is to duplicate, or cache, cer
557.0000 The easiest way to understand indexing concepts is with an illustratio
547.0000 If you require high performance, you must design your schema and index
541.0000 In most storage engines, an index can cover queries that access only c
538.0000 Some kinds of data don't correspond directly to a built in type. A tim
527.0000 When an index covers the query, you'll see “Using Index” in the Ex
525.0000 You may have noticed we're insisting on keeping the age column at the
520.0000 Unlike some other database servers, you can't choose which index to cl
516.0000 Because it's really up to the storage engines to store the data, not a
513.0000 Also in contrast to MyISAM, secondary indexes are very different from
512.0000 MySQL can sometimes use one index for both sorting and finding rows. T
511.0000 This example shows how InnoDB can lock rows it doesn't really need, ev
511.0000 For high concurrency workloads, inserting in primary key order can act
511.0000 If you need to store many true or false values, consider MySQL's nativ
510.0000 MySQL AB is working on improving this. Some of the upcoming improvemen
510.0000 This design may appeal to developers, because it lets them work in a c

====================== words_per_sentence
80.0000 Here's an example: if you design a product table with an ENUM field to
68.0000 The probability of a hash collision grows much faster than you might t
58.0000 You can find a good prefix length by selecting a sample of the most fr
52.5000 An inexperienced user might think this identifies the column's role as
52.0000 Adding and removing the LIMIT changes the query plan, which lets us tw
51.0000 A hash index is built on a hash table.<ref name="ftn5">See the compute
49.0000 Prefix indexes are a great way to make indexes smaller and faster in m
46.0000 Clustering the data by the primary key can be very beneficial to the a
42.0000 This works well because the MySQL query optimizer notices there's a sm
41.0000 Any application that keeps counts in a table can eventually run into c
41.0000 Even with the index, the query can be slow if the user interface is pa
40.0000 Now suppose we add another WHERE criterion that we know will reduce th
39.0000 The main reason to keep redundant indexes is when extending an existin
39.0000 The moral of the story is, you should strive to insert data in primary
38.0000 * Index entries are usually much smaller than the full row size, so My
38.0000 Another frequent performance nightmare is Object Relational Mapping (O
37.0000 # MySQL is using the index for an ORDER BY with a LIMIT. The index let
37.0000 For example, if you store the value b'00111001' (which is the binary e
36.5000 If someone talks about an index without mentioning a type, it's probab
36.0000 This index can help you find all people whose last name is Smith, and
35.5000 Redundant indexes usually appear when people add indexes to the table.
35.0000 When MySQL performs an index scan, it is accessing the table in index
35.0000 We've glossed over some of the details, such as how many internal B Tr
35.0000 This means you'll lose trailing spaces in CHAR columns, but not in VAR
34.5000 If your table has many rows and CRC32() gives too many collisions, imp
34.0000 Your only other options are offline engine specific repair utilities,
34.0000 Here's a second query that proves row 1 is locked, even though it didn
34.0000 The differences between clustered and non clustered data layouts, and
34.0000 The performance improvement from changing NULL columns to NOT NULL is
33.5000 The MySQL query optimizer uses two API calls to ask the storage engine
33.5000 Random values also cause caches to perform poorly for all types of que

====================== num_sentences
9.0000 Both floating point and DECIMAL types let you specify the desired prec
7.0000 Another reason to create cache tables is for optimizing search and ret
7.0000 Some kinds of data don't correspond directly to a built in type. A tim
6.0000 We assume the employee's name is unique and can be used as a primary k
6.0000 This command gives quite a lot of index information, which the MySQL m
6.0000 Here's the trick: even if there's a query that doesn't restrict the re
6.0000 A covering index can't be just any kind of index. The index must store
6.0000 Storage engines store B Tree indexes differently on disk. This can aff
6.0000 Sometimes you can use an ENUM column instead of conventional string ty
5.0000 In theory, MySQL could have skipped building a new table. The default
5.0000 The most common way to denormalize data is to duplicate, or cache, cer
5.0000 The tools for de fragmenting data and indexes are OPTIMIZE TABLE, a no
5.0000 Corrupted indexes can cause queries to return wrong results, or even c
5.0000 If the last_online restriction appears without the age restriction, or
5.0000 Oddly enough, the first thing to decide is whether we have to use inde
5.0000 The easiest way to understand indexing concepts is with an illustratio
5.0000 The first thing to look at is response time. If your queries are takin
5.0000 This example shows how InnoDB can lock rows it doesn't really need, ev
5.0000 In most storage engines, an index can cover queries that access only c
5.0000 For high concurrency workloads, inserting in primary key order can act
5.0000 Also in contrast to MyISAM, secondary indexes are very different from
5.0000 * Tables built upon clustered indexes are subject to page splits when
5.0000 Unlike some other database servers, you can't choose which index to cl
5.0000 You can often save space and get good performance by indexing the firs
5.0000 FULLTEXT is a special type of index for MyISAM tables. It finds keywor
5.0000 Maatkit ([http://maatkit.sourceforge.net/ http://maatkit.sourceforge.n
5.0000 Before MySQL 5.0, BIT is just a synonym for TINYINT. In MySQL 5.0 and
5.0000 TIMESTAMP also has special properties DATETIME doesn't have. By defaul
5.0000 However, there's another benefit to converting the columns. According
5.0000 MySQL supports quite a few string data types, with many variations on
5.0000 The DECIMAL type is for storing exact fractional numbers. In MySQL 5.0

====================== syllables_per_word
2.8750 * Normalized updates are usually faster than denormalized updates.
2.2500 InnoDB supports transactions and four transaction isolation levels.
2.1154 Good performance depends on both an optimal schema and optimal indexin
2.0500 We've covered most important data type considerations, some of them wi
2.0000 The table's data storage can also become fragmented. However, data sto
2.0000 If the Extra column of EXPLAIN contains “Using temporary,” the que
1.9697 Schemas are often designed from E R (entity relationship) diagrams or
1.9592 These limitations make hash indexes useful only in special cases. Howe
1.9583 These advantages can boost performance tremendously, if you design you
1.9545 * Covering indexes are especially helpful for InnoDB tables because of
1.9487 Your choice determines how MySQL stores the data, in memory and on dis
1.9412 The differences between clustered and non clustered data layouts, and
1.9333 * Pages become sparsely and irregularly filled because of splitting, s
1.9286 * Normalized tables are usually smaller, so they fit better in memory
1.9286 In addition to the Memory storage engine's explicit hash indexes, the
1.9241 This design may appeal to developers, because it lets them work in a c
1.9167 * Insert speed depends heavily on inserting in clustered key (primary
1.9048 InnoDB stores the same data very differently because of its clustered
1.8947 * Secondary (non clustered) indexes are larger because their leaf node
1.8919 It's harder for MySQL to optimize queries that refer to nullable colum
1.8873 Storage engines store B Tree indexes differently on disk. This can aff
1.8864 Unfortunately, it doesn't work for unique indexes, because DISABLE KEY
1.8750 Figure 4 TODO shows a simplified B Tree index's structure.
1.8667 Normalization generally means the degree to which each fact is represe
1.8571 DECIMAL math in MySQL 5.0 and newer uses complex emulation, because CP
1.8500 B Tree indexes can become fragmented, which reduces performance. Fragm
1.8485 Notice the auto incrementing integer primary key. The second case is a
1.8485 MySQL has a few storage types that use individual bits within a value
1.8421 We benchmarked two cases. The first is inserting into a userinfo table
1.8400 Sometimes you should go beyond the role of a developer, and question t
1.8387 The data type's complexity is also important. For example, integer dat

====================== kincaid
33.7525 Here's an example: if you design a product table with an ENUM field to
30.8859 The probability of a hash collision grows much faster than you might t
25.1369 You can find a good prefix length by selecting a sample of the most fr
24.1021 An inexperienced user might think this identifies the column's role as
23.1283 Clustering the data by the primary key can be very beneficial to the a
21.8843 A hash index is built on a hash table.<ref name="ftn5">See the compute
21.5812 Prefix indexes are a great way to make indexes smaller and faster in m
21.4550 * Normalized updates are usually faster than denormalized updates.
21.2554 Adding and removing the LIMIT changes the query plan, which lets us tw
20.5759 The differences between clustered and non clustered data layouts, and
19.6829 Any application that keeps counts in a table can eventually run into c
18.7932 Another frequent performance nightmare is Object Relational Mapping (O
18.4900 This works well because the MySQL query optimizer notices there's a sm
18.2277 The main reason to keep redundant indexes is when extending an existin
18.0325 Redundant indexes usually appear when people add indexes to the table.
17.9561 Even with the index, the query can be slow if the user interface is pa
17.8806 If someone talks about an index without mentioning a type, it's probab
17.8616 * Index entries are usually much smaller than the full row size, so My
17.7738 The moral of the story is, you should strive to insert data in primary
17.7100 Now suppose we add another WHERE criterion that we know will reduce th
17.5400 Another way to calculate a good prefix length is by computing selectiv
17.5400 If you need quick lookups on multi dimensional data from any combinati
17.4524 Here's a second query that proves row 1 is locked, even though it didn
17.2771 This means you'll lose trailing spaces in CHAR columns, but not in VAR
17.2004 The MySQL query optimizer uses two API calls to ask the storage engine
17.1053 The performance improvement from changing NULL columns to NOT NULL is
17.0184 For example, if you store the value b'00111001' (which is the binary e
16.9467 Be very careful with completely “random” strings, such as those pr
16.9400 We've glossed over some of the details, such as how many internal B Tr
16.8783 These advantages can boost performance tremendously, if you design you
16.5891 There's a way to work around both problems with a combination of cleve

====================== syllables per sentence
30 There is typically no reason to do so unless you want to have different types of indexes on the same column to satisfy different kinds of queriesref nameftn12An index is not necessarily a duplicate if its a different type of index there are often good reasons to have KEYcol and FULLTEXT KEYnameref
29 Heres an example if you design a product table with an ENUM field to define the product type you might want a lookup table primary keyed on an identical ENUM field You could add columns to the lookup table for descriptive text to generate a glossary or to provide meaningful labels in a pull down menu on a website In this case youll want to use the ENUM as an identifier but for most purposes you should avoid doing so
26 There is currently no way to de fragment InnoDB indexes as they cant be built by a sort in MySQL 50ref nameftn16The InnoDB developers are working on this problem at the time of writingref Even dropping and re creating them may result in fragmented indexes depending on the data
26 We suggest you think carefully before trading performance for developer productivity and always test on a realistically large data set so you dont discover performance problems too late
24 They slow INSERT because the inserted value has to go in a random location in indexes which can cause excessive page splits and random disk accesses and causes clustered index fragmentation for clustered storage engines such as InnoDB
23 These indexes will satisfy the most frequently specified search queries but how can we design indexes for less common options such as has_pictures eye_color hair_color and education
23 It shows average selectivity but its also important to account for worst case selectivity which will be different from the average unless the data is evenly distributed
23 When you add indexes try to extend existing indexes instead because it is usually more efficient to maintain one multi column index than several single column indexes
22 In most cases you dont want redundant indexes and should extend existing indexes rather than adding new ones but it is not as clear cut as with duplicate indexesthere are times when you need redundant indexes for performance reasons
22 The differences between clustered and non clustered data layouts and the corresponding differences between primary and secondary indexes can be confusing and surprising
22 This illustrates a general principle when youre designing indexes keep in mind not only the kinds of indexes you need for existing queries but consider the queries to be candidates for optimization too
22 This is a technique that makes indexes much faster to build and results in a compact index layout InnoDB currently builds its indexes a row at a time in primary key order which means the index trees arent built in optimal order and are fragmented
21 Integer types can optionally have the UNSIGNED attribute which disallows negative values and approximately doubles the upper limit of positive values you can store
21 For high concurrency workloads inserting in primary key order can actually create a single point of contention in InnoDB as it is currently implemented
21 If MySQL knows itll need to access the row at any point while executing the query MySQL will always read the roweven when a better query plan is theoretically possibleand the index wont cover the queryref nameftn9This may change in future MySQL versions but it applies as of MySQL 50 and 51ref
21 Because of the space and computational cost you should use DECIMAL only when you need exact results for fractional numbersfor example when performing financial operations
20 If you need quick lookups on multi dimensional data from any combination of dimensions you might consider storing non spatial data into a geospatial type just so you can have spatial indexes
20 For example if there is no index on category and theres an index on price a filesort may be better or worse than an index scan depending on the selectivity of the category column
20 MyISAM uses prefix compression to reduce index size allowing more of the index to fit in memory and improving performance dramatically in some cases
20 These queries often need a particular table and index structure which is different from the one you would use for general OLTP online transaction processing operations
20 Example 2 where the second condition filter leaves only a small set of results after index filtering shows how effective the proposed optimization is five times better on our data
20 Weve covered most important data type considerations some of them with serious performance implications others with just minor performance effects
20 This is true in general adding new indexes may have a dramatic performance impact for INSERT UPDATE and DELETE operations especially if the new index causes you to hit memory limits
20 Try to avoid the common mistake of creating indexes without knowing which queries will use them and consider whether all indexes together are an optimal configuration
19 This type of fragmentation only affects some operations such as full table scans and clustered index range scans which normally benefit from a sequential data layout on disk
19 Refer to computer science literature for a detailed explanation of B Tree indexesref This is the only index type all of MySQLs storage engines support
19 We mention some other strategies to optimize counter operations on page TODO application level optimization I think counters in memcache and updating a row with RAND
19 The optimizer uses the statistics to estimate the number of rows a query will examine which is important because MySQLs optimizer is cost based and the main cost metric is how much data the query will access
18 For example DECIMAL18 9 will store 9 digits from each side of the decimal point using 9 bytes in total 4 for the digits before the decimal point one for the decimal point itself and 4 bytes for the digits after decimal point
18 If someone talks about an index without mentioning a type its probably a B Tree index which uses a B Tree data structure to store its dataref nameftn3Its actually a BTree index since each leaf node contains a link to the next for fast range traversals through nodes
18 MySQL can perform prefix match LIKE patterns in the index because MySQL can convert them to simple comparisons but the leading wildcard in the query makes it impossible for the storage engine to evaluate the match

====================== worst overall
7 An inexperienced user might think this identifies the column's role as
6 The differences between clustered and non clustered data layouts, and
5 This design may appeal to developers, because it lets them work in a c
5 * Normalized updates are usually faster than denormalized updates.
5 Be very careful with completely “random” strings, such as those pr
5 These advantages can boost performance tremendously, if you design you
5 Clustering the data by the primary key can be very beneficial to the a
4 We assume the employee's name is unique and can be used as a primary k
4 The easiest way to understand indexing concepts is with an illustratio
4 * Tables built upon clustered indexes are subject to page splits when
4 Unfortunately, it doesn't work for unique indexes, because DISABLE KEY
4 Another frequent performance nightmare is Object Relational Mapping (O
4 Some kinds of data don't correspond directly to a built in type. A tim
4 Here's an example: if you design a product table with an ENUM field to
4 Another reason to create cache tables is for optimizing search and ret
4 Redundant indexes usually appear when people add indexes to the table.
4 Prefix indexes are a great way to make indexes smaller and faster in m
4 We've covered most important data type considerations, some of them wi
4 The tools for de fragmenting data and indexes are OPTIMIZE TABLE, a no
4 If the last_online restriction appears without the age restriction, or
4 The most common way to denormalize data is to duplicate, or cache, cer
4 Good performance depends on both an optimal schema and optimal indexin
4 The probability of a hash collision grows much faster than you might t
4 Any application that keeps counts in a table can eventually run into c
4 You can find a good prefix length by selecting a sample of the most fr
4 * Covering indexes are especially helpful for InnoDB tables because of
4 Storage engines store B Tree indexes differently on disk. This can aff
4 Both floating point and DECIMAL types let you specify the desired prec
4 Schemas are often designed from E R (entity relationship) diagrams or
4 In most storage engines, an index can cover queries that access only c
3 Here's the trick: even if there's a query that doesn't restrict the re

## clean_text.pl
#!/usr/bin/perl

use strict;
use warnings FATAL => 'all';

use English qw(-no_match_vars);

my $filename = shift or die "Need a filename";

open my $fh, "<", $filename or die $OS_ERROR;
(my $fileout = $filename) =~ s/.txt/-clean.txt/;
open my $fo, ">", $fileout  or die $OS_ERROR;

my $wasblank;
while ( my $line = <$fh> ) {
   chomp $line;
   if ( !$line ) {
      $wasblank = 1;
      print $fo "\n" unless $wasblank;
      next;
   }
   next if $line =~ m/nowiki|<center>|'''|\||^ /;
   next unless $line =~ m/\s/ && $line =~ m/\./;
   $line =~ s!</?tt>|''!!g;
   $line =~ s/\n\n+/\n/g;
   $line =~ s/(\w)-(\w)/$1 $2/g;
   print $fo $line, "\n\n";
   $wasblank = 0;
}

close $fh;
close $fo;
	#!/usr/bin/perl

	use strict;
	use warnings FATAL => 'all';

	use English qw(-no_match_vars);
	use Lingua::EN::Fathom;
	use TeX::Hyphen;
	use List::Util qw(min);

	my $filename = shift or die "Give me a filename";

	my $hyp = new TeX::Hyphen();
	# file => 'hyphen.tex',
	# style => 'czech',
	# leftmin => 2,
	# rightmin => 2,
	# my @points = $hyp->hyphenate($word);

	my $f = new Lingua::EN::Fathom;

	# First print out a report on the whole file.
	$f->analyse_file($filename);
	print($f->report);

	=pod
	Number of characters : 65102
	Number of words : 10707
	Percent of complex words : 15.36
	Average syllables per word : 1.6678
	Number of sentences : 579
	Average words per sentence : 18.4922
	Number of text lines : 246
	Number of blank lines : 248
	Number of paragraphs : 246


	READABILITY INDICES

	Fog : 13.5424
	Flesch : 46.9706
	Flesch-Kincaid : 11.3019
	=cut

	=pod
	# Print out the longest words and their frequency.
	my %words = $f->unique_words;
	my @longest = reverse sort {
	length($a) * $words{$a} <=> length($b) * $words{$b}
	} keys %words;
	@longest = grep { length($_) > 4 } @longest;
	foreach my $i ( 0 .. min(30, $#longest ) ) {
	printf("%-30s %5d\n", $longest[$i], $words{$longest[$i]});
	}
	=cut

	# Now find the longest and least readable paragraphs, and the sentences with the
	# most syllables. These metrics are a 1 if more is better, and a 0 else.
	my %metrics = (
	num_chars => 0,
	num_words => 0,
	percent_complex_words => 0,
	num_sentences => 0,
	syllables_per_word => 0,
	words_per_sentence => 0,
	fog => 0,
	flesch => 1,
	kincaid => 0,
	);

	my @stats;
	my %syll_for;
	open my $fi, "<", $filename or die $OS_ERROR;
	while ( my $line = <$fi> ) {
	chomp $line;
	next unless $line;

	my $st = { line => $line };
	$f->analyse_block($line);
	foreach my $metric ( keys %metrics ) {
	$st->{$metric} = $f->$metric();
	}

	# My own metric: syllables per paragraph and syllables per sentence.
	$st->{tex_syllables} = 0;
	foreach my $sentence ( split(/[.!?]\s+(?=[A-Z])/, $line ) ) {
	$sentence =~ s/[^\w\s]+//g;
	foreach my $word ( split(/\s+/, $sentence ) ) {
	my $hy = $hyp->hyphenate($word);
	$syll_for{$sentence} += $hy;
	$st->{tex_syllables} += $hy;
	}
	}

	push @stats, $st;
	}
	close $fi;

	my %worst_overall;

	foreach my $metric ( qw(tex_syllables), keys %metrics ) {
	my @top = sort { $a->{$metric} <=> $b->{$metric} } @stats;
	if ( !$metrics{$metric} ) {
	@top = reverse @top;
	}
	if ( $metric eq 'flesch' ) {
	@top = grep { $_->{flesch} != 0 } @top;
	}
	print "====================== $metric\n";
	foreach my $i ( 0 .. min(30, $#top) ) {
	my $st = $top[$i];
	printf("%6.4f %s\n", $st->{$metric}, substr($st->{line}, 0, 70));
	$worst_overall{$st->{line}}++;
	}
	print "\n";
	}

	# Syllables-per-sentence report.
	my @top = reverse sort { $syll_for{$a} <=> $syll_for{$b} } keys %syll_for;
	print "====================== syllables per sentence\n";
	foreach my $i ( 0 .. min(30, $#top) ) {
	my $sent = $top[$i];
	print $syll_for{$sent}, ' ', $sent, "\n";
	}

	# Lines that showed up as bad in most of the Fathom reports.
	@top = reverse sort { $worst_overall{$a} <=> $worst_overall{$b} } keys %worst_overall;
	print "\n====================== worst overall\n";
	foreach my $i ( 0 .. min(30, $#top) ) {
	my $sent = $top[$i];
	print $worst_overall{$sent}, ' ', substr($sent, 0, 70), "\n";
	}