Skip to content

Instantly share code, notes, and snippets.

@Erreinion
Last active November 24, 2023 22:35
Show Gist options
  • Save Erreinion/6691093 to your computer and use it in GitHub Desktop.
Save Erreinion/6691093 to your computer and use it in GitHub Desktop.
HTMLClipboard with updates for Python 3. Includes exception handling for locked clipboard. Improvement of: http://code.activestate.com/recipes/474121/
"""
Created on Sep 24, 2013
@author: RandomHardcoreJerks
Requires pywin32
original: http://code.activestate.com/recipes/474121/
# HtmlClipboard
# An interface to the "HTML Format" clipboard data format
__author__ = "Phillip Piper (jppx1[at]bigfoot.com)"
__date__ = "2006-02-21"
__version__ = "0.1"
"""
import re
import time
import random
import win32clipboard
#---------------------------------------------------------------------------
# Convenience functions to do the most common operation
def HasHtml():
    """
    Report whether the clipboard currently offers data in the HTML format.

    Returns the result of HtmlClipboard.HasHtmlFormat() on a fresh instance.
    """
    return HtmlClipboard().HasHtmlFormat()
def GetHtml():
    """
    Fetch the Html fragment held in the clipboard.

    Returns None when the clipboard does not offer the HTML format.
    """
    clipboard = HtmlClipboard()
    if not clipboard.HasHtmlFormat():
        return None
    return clipboard.GetFragment()
def PutHtml(fragment):
    """
    Place the given Html fragment on the clipboard.

    Shortcut for HtmlClipboard().PutFragment(fragment) with every optional
    argument left at its default.
    """
    HtmlClipboard().PutFragment(fragment)
#---------------------------------------------------------------------------
class HtmlClipboard:
    """
    Reader/writer for the Windows "HTML Format" (CF_HTML) clipboard format.

    The clipboard payload is a UTF-8 byte string made of a textual header
    followed by an HTML document.  Every offset stored in the header
    (StartHTML, EndHTML, StartFragment, ...) is a BYTE offset into that
    payload, not a character offset, so all offset arithmetic in this class
    is done on the UTF-8 encoded form.
    """

    # Clipboard format id, registered lazily by GetCfHtml() and then shared
    # by every instance.
    CF_HTML = None

    # Header written in front of the HTML document.  The numeric fields are
    # zero padded to a fixed width so the header length does not depend on
    # the values that end up in it.
    MARKER_BLOCK_OUTPUT = (
        "Version:1.0\r\n"
        "StartHTML:%09d\r\n"
        "EndHTML:%09d\r\n"
        "StartFragment:%09d\r\n"
        "EndFragment:%09d\r\n"
        "StartSelection:%09d\r\n"
        "EndSelection:%09d\r\n"
        "SourceURL:%s\r\n"
    )

    # Header variant carrying an explicit selection range.
    MARKER_BLOCK_EX = (
        r"Version:(\S+)\s+"
        r"StartHTML:(\d+)\s+"
        r"EndHTML:(\d+)\s+"
        r"StartFragment:(\d+)\s+"
        r"EndFragment:(\d+)\s+"
        r"StartSelection:(\d+)\s+"
        r"EndSelection:(\d+)\s+"
        r"SourceURL:(\S+)"
    )
    MARKER_BLOCK_EX_RE = re.compile(MARKER_BLOCK_EX)

    # Header variant without a selection range.
    MARKER_BLOCK = (
        r"Version:(\S+)\s+"
        r"StartHTML:(\d+)\s+"
        r"EndHTML:(\d+)\s+"
        r"StartFragment:(\d+)\s+"
        r"EndFragment:(\d+)\s+"
        r"SourceURL:(\S+)"
    )
    MARKER_BLOCK_RE = re.compile(MARKER_BLOCK)

    # Document wrapped around a bare fragment when the caller supplies none.
    DEFAULT_HTML_BODY = (
        "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\">"
        "<HTML><HEAD></HEAD><BODY><!--StartFragment-->%s<!--EndFragment--></BODY></HTML>"
    )

    # winerror codes the original recipe treated as "try again":
    # 5 = access denied (another process holds the clipboard),
    # 1418 = thread does not have the clipboard open, 0 = open failure.
    _RETRY_WINERRORS = (5, 1418, 0)

    def __init__(self):
        self.html = None
        self.fragment = None
        self.selection = None
        self.source = None
        self.htmlClipboardVersion = None
        # Raw header text of the last decoded payload; initialised here so
        # reading it before a successful decode yields None, not an error.
        self.prefix = None

    def GetCfHtml(self):
        """
        Return the FORMATID of the HTML format, registering it on first use.
        """
        if HtmlClipboard.CF_HTML is None:
            HtmlClipboard.CF_HTML = win32clipboard.RegisterClipboardFormat("HTML Format")
        return HtmlClipboard.CF_HTML

    def GetAvailableFormats(self):
        """
        Return a possibly empty list of formats available on the clipboard.

        Raises whatever win32clipboard.OpenClipboard raises when the
        clipboard cannot be opened.
        """
        formats = []
        # Open outside the try block: if the open itself fails there is
        # nothing to close, and closing would mask the real error.
        win32clipboard.OpenClipboard(0)
        try:
            cf = win32clipboard.EnumClipboardFormats(0)
            while cf != 0:
                formats.append(cf)
                cf = win32clipboard.EnumClipboardFormats(cf)
        finally:
            win32clipboard.CloseClipboard()
        return formats

    def HasHtmlFormat(self):
        """
        Return a boolean indicating if the clipboard has data in HTML format.
        """
        return self.GetCfHtml() in self.GetAvailableFormats()

    def GetFromClipboard(self, timeout=None):
        """
        Read and decode the HTML from the clipboard.

        While the clipboard is held by another process the open is retried
        after a short random pause so two waiting processes do not collide
        again.

        timeout: maximum number of seconds to keep retrying; None (the
            default) retries for as long as it takes, as the original did.

        Raises the underlying error when opening fails for a reason that is
        not retryable, when the timeout expires, or when the HTML data
        cannot be read.  The clipboard is always closed again once opened.
        """
        cfHtml = self.GetCfHtml()
        deadline = None if timeout is None else time.monotonic() + timeout
        while True:
            try:
                win32clipboard.OpenClipboard(0)
            except Exception as err:
                # Only errors carrying a known transient winerror are
                # retried; anything else is a real failure for the caller.
                if getattr(err, "winerror", None) not in self._RETRY_WINERRORS:
                    raise
                if deadline is not None and time.monotonic() >= deadline:
                    raise
                time.sleep(random.random() / 50)
                continue
            try:
                src = win32clipboard.GetClipboardData(cfHtml)
            finally:
                win32clipboard.CloseClipboard()
            break
        # Decoding happens after the clipboard is released and works on the
        # raw bytes, because the header offsets are byte offsets.
        self.DecodeClipboardSource(src)

    def DecodeClipboardSource(self, src):
        """
        Decode a clipboard payload and store its parts on this instance.

        src: the payload as bytes (preferred) or as str.  A str is encoded
            to UTF-8 first so the header's byte offsets can be applied.

        Sets prefix, htmlClipboardVersion, html, fragment, selection and
        source.  When the header matches neither known layout the instance
        is left unchanged.
        """
        raw = src.encode("UTF-8") if isinstance(src, str) else bytes(src)
        # The text form is used only to parse the header.
        text = raw.decode("UTF-8", errors="replace")

        def cut(start, end):
            # Slice by byte offset, then decode just that slice.
            return raw[int(start):int(end)].decode("UTF-8", errors="replace")

        # Try the extended format first (which has an explicit selection)
        matches = self.MARKER_BLOCK_EX_RE.match(text)
        if matches:
            self.prefix = matches.group(0)
            self.htmlClipboardVersion = matches.group(1)
            self.html = cut(matches.group(2), matches.group(3))
            self.fragment = cut(matches.group(4), matches.group(5))
            self.selection = cut(matches.group(6), matches.group(7))
            self.source = matches.group(8)
            return

        # Failing that, try the version without a selection
        matches = self.MARKER_BLOCK_RE.match(text)
        if matches:
            self.prefix = matches.group(0)
            self.htmlClipboardVersion = matches.group(1)
            self.html = cut(matches.group(2), matches.group(3))
            self.fragment = cut(matches.group(4), matches.group(5))
            self.source = matches.group(6)
            self.selection = self.fragment

    def GetHtml(self, refresh=False):
        """
        Return the entire Html document, reading the clipboard if nothing is
        cached yet or refresh is true.
        """
        if not self.html or refresh:
            self.GetFromClipboard()
        return self.html

    def GetFragment(self, refresh=False):
        """
        Return the Html fragment. A fragment is well-formed HTML enclosing
        the selected text.
        """
        if not self.fragment or refresh:
            self.GetFromClipboard()
        return self.fragment

    def GetSelection(self, refresh=False):
        """
        Return the part of the HTML that was selected. It might not be
        well-formed.
        """
        if not self.selection or refresh:
            self.GetFromClipboard()
        return self.selection

    def GetSource(self, refresh=False):
        """
        Return the URL of the source of this HTML.
        """
        # Check the cached source itself (the original tested the selection).
        if not self.source or refresh:
            self.GetFromClipboard()
        return self.source

    def PutFragment(self, fragment, selection=None, html=None, source=None):
        """
        Put the given well-formed fragment of Html into the clipboard.

        selection, if given, must be a literal string within fragment.
        html, if given, must be a well-formed Html document that textually
        contains fragment and its required markers.
        source, if given, is written to the header as the SourceURL.

        Raises ValueError when fragment or selection cannot be found in html.
        """
        if selection is None:
            selection = fragment
        if html is None:
            html = self.DEFAULT_HTML_BODY % fragment
        if source is None:
            source = "file://HtmlClipboard.py"

        # Look for the fragment after the start marker first; a short
        # fragment such as "a" would otherwise match inside the DOCTYPE.
        marker = "<!--StartFragment-->"
        markerPos = html.find(marker)
        searchFrom = markerPos + len(marker) if markerPos >= 0 else 0
        fragmentStart = html.find(fragment, searchFrom)
        if fragmentStart < 0:
            fragmentStart = html.index(fragment)
        fragmentEnd = fragmentStart + len(fragment)

        # Prefer an occurrence of the selection inside the fragment.
        selectionStart = html.find(selection, fragmentStart, fragmentEnd)
        if selectionStart < 0:
            selectionStart = html.index(selection)
        selectionEnd = selectionStart + len(selection)

        self.PutToClipboard(html, fragmentStart, fragmentEnd, selectionStart, selectionEnd, source)

    def PutToClipboard(self, html, fragmentStart, fragmentEnd, selectionStart, selectionEnd, source="None"):
        """
        Replace the Clipboard contents with the given html information.

        The four offsets are character offsets into html; they are converted
        to byte offsets when the header is built.
        """
        # Build the payload before touching the clipboard, so an encoding
        # failure cannot leave the clipboard emptied.
        src = self.EncodeClipboardSource(html, fragmentStart, fragmentEnd, selectionStart, selectionEnd, source)
        src = src.encode("UTF-8")
        cfHtml = self.GetCfHtml()
        win32clipboard.OpenClipboard(0)
        try:
            win32clipboard.EmptyClipboard()
            win32clipboard.SetClipboardData(cfHtml, src)
        finally:
            win32clipboard.CloseClipboard()

    def EncodeClipboardSource(self, html, fragmentStart, fragmentEnd, selectionStart, selectionEnd, source):
        """
        Join all our bits of information into a string formatted as per the
        HTML format specs.

        html: the document text.
        fragmentStart, fragmentEnd, selectionStart, selectionEnd: character
            offsets into html.
        source: text for the SourceURL header field.

        Returns header + html as a str.  The offsets written into the header
        are byte offsets into the UTF-8 encoding of the returned string.
        """
        def byteOffset(charOffset):
            # Bytes occupied by the first charOffset characters of html.
            return len(html[:charOffset].encode("UTF-8"))

        # How long is the prefix going to be?  Fixed-width number fields make
        # the dummy header exactly as long as the real one.
        dummyPrefix = self.MARKER_BLOCK_OUTPUT % (0, 0, 0, 0, 0, 0, source)
        lenPrefix = len(dummyPrefix.encode("UTF-8"))
        prefix = self.MARKER_BLOCK_OUTPUT % (
            lenPrefix,
            lenPrefix + len(html.encode("UTF-8")),
            lenPrefix + byteOffset(fragmentStart),
            lenPrefix + byteOffset(fragmentEnd),
            lenPrefix + byteOffset(selectionStart),
            lenPrefix + byteOffset(selectionEnd),
            source,
        )
        return prefix + html
def DumpHtml():
    """
    Print a diagnostic dump of the clipboard's HTML content.

    Always prints the available format ids and whether the HTML format is
    present.  When it is, the clipboard is read and the header prefix,
    version, selection, fragment, full document and source URL are printed.
    """
    cb = HtmlClipboard()
    print("GetAvailableFormats()=%s" % str(cb.GetAvailableFormats()))
    # Query once so the printed flag and the branch below cannot disagree.
    hasHtml = cb.HasHtmlFormat()
    print("HasHtmlFormat()=%s" % str(hasHtml))
    if not hasHtml:
        return
    cb.GetFromClipboard()
    # prefix is only set once a header has been decoded; fall back to None
    # instead of failing with AttributeError when decoding found no header.
    print("prefix=>>>%s<<<END" % getattr(cb, "prefix", None))
    print("htmlClipboardVersion=>>>%s<<<END" % cb.htmlClipboardVersion)
    print("GetSelection()=>>>%s<<<END" % cb.GetSelection())
    print("GetFragment()=>>>%s<<<END" % cb.GetFragment())
    print("GetHtml()=>>>%s<<<END" % cb.GetHtml())
    print("GetSource()=>>>%s<<<END" % cb.GetSource())
if __name__ == '__main__':
    def test_SimpleGetPutHtml():
        """
        Round-trip a small Html fragment through the clipboard and report
        whether the same fragment is read back.
        """
        sample = "<p>Writing to the clipboard is <strong>easy</strong> with this code.</p>"
        PutHtml(sample)
        print("passed" if GetHtml() == sample else "failed")

    test_SimpleGetPutHtml()
#DumpHtml()
@Decimaldelimiter
Copy link

Decimaldelimiter commented Nov 22, 2023

Hi!

Thank you for the code.
Please be aware: if the original text contains non-ASCII characters, then len(src) will be less than len( src.encode("UTF-8") ).

So the current calculation of fragmentEnd = fragmentStart + len(fragment) and selectionEnd = selectionStart + len(selection) in PutFragment method may be wrong.

As a result, the clipboard will contain wrong ranges.

Reproducing with following:

        data =  "<p>Writing ████████████ to the clipboard is <strong>easy</strong> with this code.</p>"\
                "<p>Writing ████████████ to the clipboard is <strong>easy</strong> with this code.</p>"\
                "<p>Writing ████████████ to the clipboard is <strong>easy</strong> with this code.</p>"\
                "<p>Writing ████████████ to the clipboard is <strong>easy</strong> with this code.</p>"\
                "<p>Writing ████████████ to the clipboard is <strong>easy</strong> with this code.</p>"\
                "<p>Writing ████████████ to the clipboard is <strong>easy</strong> with this code.</p>"\
                "<p>Writing ████████████ to the clipboard is <strong>easy</strong> with this code.</p>"\
                "<p>Writing ████████████ to the clipboard is <strong>easy</strong> with this code.</p>"\
                "<p>Writing ████████████ to the clipboard is <strong>easy</strong> with this code.</p>"

The run reports "passed", but if you try to paste your clipboard e.g. into a Word document, you will receive only 7 HTML lines instead of the given 9.

A simple fix for PutFragment will be to change:

        #fragmentEnd = fragmentStart + len(fragment)
        fragmentEnd = fragmentStart + len(fragment.encode("UTF-8"))

...

        #selectionEnd = selectionStart + len(selection)
        selectionEnd = selectionStart + len(selection.encode("UTF-8"))

Additionally, in EncodeClipboardSource:

        lenHtml = len(html.encode("UTF-8"))

        prefix = self.MARKER_BLOCK_OUTPUT % (lenPrefix, lenHtml+lenPrefix,
                        fragmentStart+lenPrefix, fragmentEnd+lenPrefix,
                        selectionStart+lenPrefix, selectionEnd+lenPrefix,
                        source)

Now the ranges in the clipboard object are calculated correctly from the length of the byte array, and the fragment will be pasted correctly into Word.

In the same way, GetFromClipboard must be reworked a little. I did it as follows:

    def DecodeClipboardSource(self, src):
        """
        Decode the given clipboard payload to figure out the details of the HTML it carries.

        src is expected to be the raw bytes read from the clipboard: it is decoded
        once for header parsing and sliced as bytes using the offsets from the header.

        Sets self.html, self.fragment and self.selection.  When a header matches,
        self.prefix, self.htmlClipboardVersion and self.source are set as well.
        When neither header layout matches, all offsets stay 0, so html, fragment
        and selection are set to empty strings and the other attributes are untouched.
        """

        # Text form of the payload, used only to match the header patterns.
        parseSrc = src.decode("UTF-8")

        # Offsets default to 0 so the slices at the end are empty on no match.
        startHtml = 0
        endHtml = 0
        startFragment = 0
        endFragment = 0
        startSelection = 0
        endSelection = 0

        # Try the extended format first (which has an explicit selection)
        matches = self.MARKER_BLOCK_EX_RE.match(parseSrc)
        if matches:
            self.prefix = matches.group(0)
            self.htmlClipboardVersion = matches.group(1)

            startHtml = int(matches.group(2))
            endHtml = int(matches.group(3))
            startFragment = int(matches.group(4))
            endFragment = int(matches.group(5))
            startSelection = int(matches.group(6))
            endSelection = int(matches.group(7))

            # The slicing that used to happen here is deferred to the end of
            # the method, where it is applied to the bytes instead of the text.

            self.source = matches.group(8)
        else:
            # Failing that, try the version without a selection
            matches = self.MARKER_BLOCK_RE.match(parseSrc)
            if matches:
                self.prefix = matches.group(0)
                self.htmlClipboardVersion = matches.group(1)

                startHtml = int(matches.group(2))
                endHtml = int(matches.group(3))
                startFragment = int(matches.group(4))
                endFragment = int(matches.group(5))
                # No explicit selection in this layout: reuse the fragment range.
                startSelection = startFragment
                endSelection = endFragment

                self.source = matches.group(6)

        # Slice the bytes with the header offsets, then decode each slice.
        self.html = src[startHtml:endHtml].decode("UTF-8")
        self.fragment = src[startFragment:endFragment].decode("UTF-8")
        self.selection = src[startSelection:endSelection].decode("UTF-8")

And in GetFromClipboard, I commented out the call of decode but passed the bytes to DecodeClipboardSource:

                #src = src.decode("UTF-8")
                #print(src)
                self.DecodeClipboardSource(src)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment